diff --git a/.travis.yml b/.travis.yml index d13509805e0f8..a4d323ef8ba95 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,6 +2,7 @@ language: python env: + global: # scatterci API key #- secure: "Bx5umgo6WjuGY+5XFa004xjCiX/vq0CyMZ/ETzcs7EIBI1BE/0fIDXOoWhoxbY9HPfdPGlDnDgB9nGqr5wArO2s+BavyKBWg6osZ3dmkfuJPMOWeyCa92EeP+sfKw8e5HSU5MizW9e319wHWOF/xkzdHR7T67Qd5erhv91x4DnQ=" @@ -19,6 +20,7 @@ matrix: - NOSE_ARGS="not slow and not network and not disabled" - CLIPBOARD=xclip - LOCALE_OVERRIDE="it_IT.UTF-8" + - BUILD_TYPE=conda - JOB_NAME: "26_nslow_nnet" - python: 2.7 env: @@ -26,12 +28,14 @@ matrix: - LOCALE_OVERRIDE="zh_CN.GB18030" - FULL_DEPS=true - JOB_TAG=_LOCALE + - BUILD_TYPE=conda - JOB_NAME: "27_slow_nnet_LOCALE" - python: 2.7 env: - NOSE_ARGS="not slow and not disabled" - FULL_DEPS=true - CLIPBOARD_GUI=gtk2 + - BUILD_TYPE=conda - JOB_NAME: "27_nslow" - DOC_BUILD=true # if rst files were changed, build docs in parallel with tests - python: 3.3 @@ -39,18 +43,21 @@ matrix: - NOSE_ARGS="not slow and not disabled" - FULL_DEPS=true - CLIPBOARD=xsel + - BUILD_TYPE=conda - JOB_NAME: "33_nslow" - python: 3.4 env: - NOSE_ARGS="not slow and not disabled" - FULL_DEPS=true - CLIPBOARD=xsel + - BUILD_TYPE=conda - JOB_NAME: "34_nslow" - python: 3.2 env: - NOSE_ARGS="not slow and not disabled" - FULL_DEPS=true - CLIPBOARD_GUI=qt4 + - BUILD_TYPE=pydata - JOB_NAME: "32_nslow" - python: 2.7 env: @@ -59,6 +66,7 @@ matrix: - JOB_NAME: "27_numpy_master" - JOB_TAG=_NUMPY_DEV_master - NUMPY_BUILD=master + - BUILD_TYPE=pydata - PANDAS_TESTING_MODE="deprecate" allow_failures: - python: 3.2 @@ -66,6 +74,7 @@ matrix: - NOSE_ARGS="not slow and not disabled" - FULL_DEPS=true - CLIPBOARD_GUI=qt4 + - BUILD_TYPE=pydata - JOB_NAME: "32_nslow" - python: 2.7 env: @@ -74,11 +83,14 @@ matrix: - JOB_NAME: "27_numpy_master" - JOB_TAG=_NUMPY_DEV_master - NUMPY_BUILD=master + - BUILD_TYPE=pydata - PANDAS_TESTING_MODE="deprecate" before_install: - echo "before_install" - echo $VIRTUAL_ENV + - export PATH="$HOME/miniconda/bin:$PATH" + - sudo apt-get install ccache - df -h - date - pwd @@ -92,7 +104,7 @@ before_install: install: - echo "install" - ci/prep_ccache.sh - - ci/install.sh + - ci/install_${BUILD_TYPE}.sh - ci/submit_ccache.sh before_script: @@ -106,6 +118,6 @@ script: after_script: - if [ -f /tmp/doc.log ]; then cat /tmp/doc.log; fi - - ci/print_versions.py + - source activate pandas && ci/print_versions.py - ci/print_skipped.py /tmp/nosetests.xml - ci/after_script.sh diff --git a/README.md b/README.md index 6a645dc64123d..93bfe7482d31e 100644 --- a/README.md +++ b/README.md @@ -96,7 +96,7 @@ pip install pandas - Needed for time zone support with ``pandas.date_range`` ### Highly Recommended Dependencies -- [numexpr](http://code.google.com/p/numexpr/) +- [numexpr](https://github.com/pydata/numexpr) - Needed to accelerate some expression evaluation operations - Required by PyTables - [bottleneck](http://berkeleyanalytics.com/bottleneck) @@ -218,7 +218,6 @@ has been under active development since then. Since pandas development is related to a number of other scientific Python projects, questions are welcome on the scipy-user mailing list. 
Specialized discussions or design issues should take place on -the pystatsmodels mailing list / Google group, where -``scikits.statsmodels`` and other libraries will also be discussed: +the PyData mailing list / Google group: -http://groups.google.com/group/pystatsmodels +https://groups.google.com/forum/#!forum/pydata diff --git a/appveyor.yml b/appveyor.yml new file mode 100644 index 0000000000000..9cec7895f1493 --- /dev/null +++ b/appveyor.yml @@ -0,0 +1,38 @@ +environment: + global: + # SDK v7.0 MSVC Express 2008's SetEnv.cmd script will fail if the + # /E:ON and /V:ON options are not enabled in the batch script intepreter + # See: http://stackoverflow.com/a/13751649/163740 + CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\ci\\run_with_env.cmd" + + matrix: + - PYTHON: "C:\\Python27_32" + PYTHON_VERSION: "2.7" + PYTHON_ARCH: "32" + + - PYTHON: "C:\\Python27_64" + PYTHON_VERSION: "2.7" + PYTHON_ARCH: "64" + + - PYTHON: "C:\\Python34_32" + PYTHON_VERSION: "3.4" + PYTHON_ARCH: "32" + + - PYTHON: "C:\\Python34_64" + PYTHON_VERSION: "3.4" + PYTHON_ARCH: "64" + +install: + # this installs the appropriate Miniconda (Py2/Py3, 32/64 bit), + # as well as pip, conda-build, and the binstar CLI + - echo "install" + - cd + - ls -ltr + - powershell .\\ci\\install_appveyor.ps1 + - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" + +build: false + +test_script: + - "%CMD_IN_ENV% %PYTHON%/python.exe setup.py build_ext --inplace" + - "%PYTHON%/Scripts/nosetests -A \"not slow and not network and not disabled\" pandas" diff --git a/ci/install_appveyor.ps1 b/ci/install_appveyor.ps1 new file mode 100644 index 0000000000000..a022995dc7d58 --- /dev/null +++ b/ci/install_appveyor.ps1 @@ -0,0 +1,133 @@ +# Sample script to install Miniconda under Windows +# Authors: Olivier Grisel, Jonathan Helmus and Kyle Kastner, Robert McGibbon +# License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ + +$MINICONDA_URL = "http://repo.continuum.io/miniconda/" + + +function DownloadMiniconda ($python_version, $platform_suffix) { + $webclient = New-Object System.Net.WebClient + if ($python_version -match "3.4") { + $filename = "Miniconda3-3.5.5-Windows-" + $platform_suffix + ".exe" + } else { + $filename = "Miniconda-3.5.5-Windows-" + $platform_suffix + ".exe" + } + $url = $MINICONDA_URL + $filename + + $basedir = $pwd.Path + "\" + $filepath = $basedir + $filename + if (Test-Path $filename) { + Write-Host "Reusing" $filepath + return $filepath + } + + # Download and retry up to 3 times in case of network transient errors. 
+ Write-Host "Downloading" $filename "from" $url + $retry_attempts = 2 + for($i=0; $i -lt $retry_attempts; $i++){ + try { + $webclient.DownloadFile($url, $filepath) + break + } + Catch [Exception]{ + Start-Sleep 1 + } + } + if (Test-Path $filepath) { + Write-Host "File saved at" $filepath + } else { + # Retry once to get the error message if any at the last try + $webclient.DownloadFile($url, $filepath) + } + return $filepath +} + +function Start-Executable { + param( + [String] $FilePath, + [String[]] $ArgumentList + ) + $OFS = " " + $process = New-Object System.Diagnostics.Process + $process.StartInfo.FileName = $FilePath + $process.StartInfo.Arguments = $ArgumentList + $process.StartInfo.UseShellExecute = $false + $process.StartInfo.RedirectStandardOutput = $true + if ( $process.Start() ) { + $output = $process.StandardOutput.ReadToEnd() ` + -replace "\r\n$","" + if ( $output ) { + if ( $output.Contains("`r`n") ) { + $output -split "`r`n" + } + elseif ( $output.Contains("`n") ) { + $output -split "`n" + } + else { + $output + } + } + $process.WaitForExit() + & "$Env:SystemRoot\system32\cmd.exe" ` + /c exit $process.ExitCode + } + } + +function InstallMiniconda ($python_version, $architecture, $python_home) { + Write-Host "Installing Python" $python_version "for" $architecture "bit architecture to" $python_home + if (Test-Path $python_home) { + Write-Host $python_home "already exists, skipping." + return $false + } + if ($architecture -match "32") { + $platform_suffix = "x86" + } else { + $platform_suffix = "x86_64" + } + + $filepath = DownloadMiniconda $python_version $platform_suffix + Write-Host "Installing" $filepath "to" $python_home + $install_log = $python_home + ".log" + $args = "/S /D=$python_home" + Write-Host $filepath $args + Start-Process -FilePath $filepath -ArgumentList $args -Wait + if (Test-Path $python_home) { + Write-Host "Python $python_version ($architecture) installation complete" + } else { + Write-Host "Failed to install Python in $python_home" + Get-Content -Path $install_log + Exit 1 + } +} + + +function InstallCondaPackages ($python_home, $spec) { + $conda_path = $python_home + "\Scripts\conda.exe" + $args = "install --yes --quiet " + $spec + Write-Host ("conda " + $args) + Start-Executable -FilePath "$conda_path" -ArgumentList $args +} +function InstallCondaPackagesFromFile ($python_home, $ver, $arch) { + $conda_path = $python_home + "\Scripts\conda.exe" + $args = "install --yes --quiet --file " + $env:APPVEYOR_BUILD_FOLDER + "\ci\requirements-" + $ver + "_" + $arch + ".txt" + Write-Host ("conda " + $args) + Start-Executable -FilePath "$conda_path" -ArgumentList $args +} + +function UpdateConda ($python_home) { + $conda_path = $python_home + "\Scripts\conda.exe" + Write-Host "Updating conda..." 
+ $args = "update --yes conda" + Write-Host $conda_path $args + Start-Process -FilePath "$conda_path" -ArgumentList $args -Wait +} + + +function main () { + InstallMiniconda $env:PYTHON_VERSION $env:PYTHON_ARCH $env:PYTHON + UpdateConda $env:PYTHON + InstallCondaPackages $env:PYTHON "pip setuptools nose" + InstallCondaPackagesFromFile $env:PYTHON $env:PYTHON_VERSION $env:PYTHON_ARCH +} + +main \ No newline at end of file diff --git a/ci/install_conda.sh b/ci/install_conda.sh new file mode 100755 index 0000000000000..ec0aa5fef84ae --- /dev/null +++ b/ci/install_conda.sh @@ -0,0 +1,104 @@ +#!/bin/bash + +# There are 2 distinct pieces that get zipped and cached +# - The venv site-packages dir including the installed dependencies +# - The pandas build artifacts, using the build cache support via +# scripts/use_build_cache.py +# +# if the user opted in to use the cache and we're on a whitelisted fork +# - if the server doesn't hold a cached version of venv/pandas build, +# do things the slow way, and put the results on the cache server +# for the next time. +# - if the cache files are available, instal some necessaries via apt +# (no compiling needed), then directly goto script and collect 200$. +# + +function edit_init() +{ + if [ -n "$LOCALE_OVERRIDE" ]; then + echo "Adding locale to the first line of pandas/__init__.py" + rm -f pandas/__init__.pyc + sedc="3iimport locale\nlocale.setlocale(locale.LC_ALL, '$LOCALE_OVERRIDE')\n" + sed -i "$sedc" pandas/__init__.py + echo "head -4 pandas/__init__.py" + head -4 pandas/__init__.py + echo + fi +} + +edit_init + +python_major_version="${TRAVIS_PYTHON_VERSION:0:1}" +[ "$python_major_version" == "2" ] && python_major_version="" + +home_dir=$(pwd) +echo "home_dir: [$home_dir]" + +if [ -n "$LOCALE_OVERRIDE" ]; then + # make sure the locale is available + # probably useless, since you would need to relogin + time sudo locale-gen "$LOCALE_OVERRIDE" +fi + +# Need to enable for locale testing. The location of the locale file(s) is +# distro specific. 
For example, on Arch Linux all of the locales are in a +# commented file--/etc/locale.gen--that must be commented in to be used +# whereas Ubuntu looks in /var/lib/locales/supported.d/* and generates locales +# based on what's in the files in that folder +time echo 'it_CH.UTF-8 UTF-8' | sudo tee -a /var/lib/locales/supported.d/it +time sudo locale-gen + + +# install gui for clipboard testing +if [ -n "$CLIPBOARD_GUI" ]; then + echo "Using CLIPBOARD_GUI: $CLIPBOARD_GUI" + [ -n "$python_major_version" ] && py="py" + python_cb_gui_pkg=python${python_major_version}-${py}${CLIPBOARD_GUI} + time sudo apt-get $APT_ARGS install $python_cb_gui_pkg +fi + + +# install a clipboard if $CLIPBOARD is not empty +if [ -n "$CLIPBOARD" ]; then + echo "Using clipboard: $CLIPBOARD" + time sudo apt-get $APT_ARGS install $CLIPBOARD +fi + +python_major_version="${TRAVIS_PYTHON_VERSION:0:1}" +[ "$python_major_version" == "2" ] && python_major_version="" + +wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh || exit 1 +bash miniconda.sh -b -p $HOME/miniconda || exit 1 + +conda config --set always_yes yes --set changeps1 no || exit 1 +conda update -q conda || exit 1 +conda config --add channels http://conda.binstar.org/pandas || exit 1 + +# Useful for debugging any issues with conda +conda info -a || exit 1 + +conda create -n pandas python=$TRAVIS_PYTHON_VERSION || exit 1 +conda install -n pandas --file=ci/requirements-${TRAVIS_PYTHON_VERSION}${JOB_TAG}.txt || exit 1 + +conda install -n pandas pip setuptools nose || exit 1 +conda remove -n pandas pandas + +source activate pandas + +# set the compiler cache to work +if [ "$IRON_TOKEN" ]; then + export PATH=/usr/lib/ccache:/usr/lib64/ccache:$PATH + gcc=$(which gcc) + echo "gcc: $gcc" + ccache=$(which ccache) + echo "ccache: $ccache" + export CC='ccache gcc' +fi + +python setup.py build_ext --inplace && python setup.py develop + +for package in beautifulsoup4 'python-dateutil'; do + pip uninstall --yes $package +done + +true diff --git a/ci/install.sh b/ci/install_pydata.sh similarity index 96% rename from ci/install.sh rename to ci/install_pydata.sh index fd680011322a4..33a6d3854da22 100755 --- a/ci/install.sh +++ b/ci/install_pydata.sh @@ -43,7 +43,8 @@ echo "home_dir: [$home_dir]" pip install -I -U pip pip install -I -U setuptools pip install wheel==0.22 -pip install nose==1.3.3 +#pip install nose==1.3.3 +pip install nose==1.3.4 # comment this line to disable the fetching of wheel files base_url=http://pandas.pydata.org/pandas-build/dev/wheels @@ -136,11 +137,8 @@ if [ "$IRON_TOKEN" ]; then fi # build pandas -time python setup.py sdist -pip uninstall cython -y - -# install pandas -time pip install $(find dist | grep gz | head -n 1) +python setup.py build_ext --inplace +python setup.py develop # restore cython (if not numpy building) if [ -z "$NUMPY_BUILD" ]; then diff --git a/ci/requirements-2.6.txt b/ci/requirements-2.6.txt index fec0a96a3d077..9b338cee26801 100644 --- a/ci/requirements-2.6.txt +++ b/ci/requirements-2.6.txt @@ -1,16 +1,16 @@ -numpy==1.7.0 -cython==0.19.1 -python-dateutil==1.5 -pytz==2013b -http://www.crummy.com/software/BeautifulSoup/bs4/download/4.2/beautifulsoup4-4.2.0.tar.gz -html5lib==1.0b2 -numexpr==1.4.2 -sqlalchemy==0.7.1 -pymysql==0.6.0 -psycopg2==2.5 -scipy==0.11.0 -statsmodels==0.4.3 -xlwt==0.7.5 -openpyxl==2.0.3 -xlsxwriter==0.4.6 -xlrd==0.9.2 +numpy=1.7.0 +cython=0.19.1 +dateutil=1.5 +pytz=2013b +scipy=0.11.0 +xlwt=0.7.5 +xlrd=0.9.2 +openpyxl=2.0.3 +statsmodels=0.4.3 +html5lib=1.0b2 
+beautiful-soup=4.2.0 +psycopg2=2.5.1 +numexpr=1.4.2 +pymysql=0.6.0 +sqlalchemy=0.7.8 +xlsxwriter=0.4.6 diff --git a/ci/requirements-2.7.txt b/ci/requirements-2.7.txt index 5b77428a0f6d7..f3df26e7a0c24 100644 --- a/ci/requirements-2.7.txt +++ b/ci/requirements-2.7.txt @@ -1,25 +1,25 @@ -python-dateutil==2.1 -pytz==2013b -xlwt==0.7.5 -numpy==1.8.1 -cython==0.19.1 -bottleneck==0.6.0 -numexpr==2.2.2 -tables==3.0.0 -matplotlib==1.3.1 -openpyxl==1.6.2 -xlsxwriter==0.4.6 -xlrd==0.9.2 -patsy==0.1.0 -sqlalchemy==0.9.6 -pymysql==0.6.1 -psycopg2==2.5.2 -html5lib==1.0b2 -lxml==3.2.1 -scipy==0.13.3 -beautifulsoup4==4.2.1 -statsmodels==0.5.0 -boto==2.26.1 -httplib2==0.8 -python-gflags==2.0 -google-api-python-client==1.2 +dateutil=2.1 +pytz=2013b +xlwt=0.7.5 +numpy=1.7.0 +cython=0.19.1 +numexpr=2.2.2 +pytables=3.0.0 +matplotlib=1.3.1 +openpyxl=1.6.2 +xlrd=0.9.2 +sqlalchemy=0.9.6 +lxml=3.2.1 +scipy +xlsxwriter=0.4.6 +statsmodels +boto=2.26.1 +bottleneck=0.8.0 +psycopg2=2.5.2 +patsy +pymysql=0.6.1 +html5lib=1.0b2 +beautiful-soup=4.2.1 +httplib2=0.8 +python-gflags=2.0 +google-api-python-client=1.2 diff --git a/ci/requirements-2.7_32.txt b/ci/requirements-2.7_32.txt new file mode 100644 index 0000000000000..01b305bb6f21a --- /dev/null +++ b/ci/requirements-2.7_32.txt @@ -0,0 +1,11 @@ +dateutil +pytz +xlwt +numpy +cython +numexpr +pytables +matplotlib +openpyxl +xlrd +scipy diff --git a/ci/requirements-2.7_64.txt b/ci/requirements-2.7_64.txt new file mode 100644 index 0000000000000..01b305bb6f21a --- /dev/null +++ b/ci/requirements-2.7_64.txt @@ -0,0 +1,11 @@ +dateutil +pytz +xlwt +numpy +cython +numexpr +pytables +matplotlib +openpyxl +xlrd +scipy diff --git a/ci/requirements-2.7_LOCALE.txt b/ci/requirements-2.7_LOCALE.txt index 9af33fe96d58c..036e597e5b788 100644 --- a/ci/requirements-2.7_LOCALE.txt +++ b/ci/requirements-2.7_LOCALE.txt @@ -1,18 +1,18 @@ -python-dateutil -pytz==2013b -xlwt==0.7.5 -openpyxl==1.6.2 -xlsxwriter==0.4.6 -xlrd==0.9.2 -numpy==1.7.1 -cython==0.19.1 -bottleneck==0.6.0 -matplotlib==1.3.0 -patsy==0.1.0 -sqlalchemy==0.8.1 -html5lib==1.0b2 -lxml==3.2.1 -scipy==0.10.0 -beautifulsoup4==4.2.1 -statsmodels==0.4.3 -bigquery==2.0.17 +dateutil +pytz=2013b +xlwt=0.7.5 +openpyxl=1.6.2 +xlsxwriter=0.4.6 +xlrd=0.9.2 +numpy=1.7.1 +cython=0.19.1 +bottleneck=0.8.0 +matplotlib=1.3.0 +patsy=0.1.0 +sqlalchemy=0.8.1 +html5lib=1.0b2 +lxml=3.2.1 +scipy=0.11.0 +beautiful-soup=4.2.1 +statsmodels=0.4.3 +bigquery=2.0.17 diff --git a/ci/requirements-3.3.txt b/ci/requirements-3.3.txt index fc8cb04387a55..c9beec81236fb 100644 --- a/ci/requirements-3.3.txt +++ b/ci/requirements-3.3.txt @@ -1,17 +1,17 @@ -python-dateutil==2.2 -pytz==2013b -openpyxl==1.6.2 -xlsxwriter==0.4.6 -xlrd==0.9.2 -html5lib==1.0b2 -numpy==1.8.0 -cython==0.19.1 -numexpr==2.3 -tables==3.1.0 -bottleneck==0.8.0 -matplotlib==1.2.1 -patsy==0.1.0 -lxml==3.2.1 -scipy==0.13.3 -beautifulsoup4==4.2.1 -statsmodels==0.5.0 +dateutil +pytz=2013b +openpyxl=1.6.2 +xlsxwriter=0.4.6 +xlrd=0.9.2 +html5lib=1.0b2 +numpy=1.8.0 +cython=0.19.1 +numexpr +pytables +bottleneck=0.8.0 +matplotlib +patsy +lxml=3.2.1 +scipy +beautiful-soup=4.2.1 +statsmodels diff --git a/ci/requirements-3.4.txt b/ci/requirements-3.4.txt index 0747e6f54cd73..33d3b3b4dc459 100644 --- a/ci/requirements-3.4.txt +++ b/ci/requirements-3.4.txt @@ -1,19 +1,19 @@ -python-dateutil +dateutil pytz openpyxl xlsxwriter xlrd html5lib -numpy==1.8.0 -cython==0.20.2 -scipy==0.13.3 -numexpr==2.4 -tables==3.1.0 -bottleneck==0.8.0 -matplotlib==1.3.1 patsy -lxml==3.3.5 -sqlalchemy==0.9.6 -pymysql==0.6.1 
-psycopg2==2.5.2 -beautifulsoup4 +beautiful-soup +numpy +cython +scipy +numexpr +pytables +matplotlib +lxml +sqlalchemy +bottleneck +pymysql +psycopg2 diff --git a/ci/requirements-3.4_32.txt b/ci/requirements-3.4_32.txt new file mode 100644 index 0000000000000..e9dfe9f0ee19e --- /dev/null +++ b/ci/requirements-3.4_32.txt @@ -0,0 +1,10 @@ +dateutil +pytz +openpyxl +xlrd +numpy +cython +scipy +numexpr +pytables +matplotlib diff --git a/ci/requirements-3.4_64.txt b/ci/requirements-3.4_64.txt new file mode 100644 index 0000000000000..e9dfe9f0ee19e --- /dev/null +++ b/ci/requirements-3.4_64.txt @@ -0,0 +1,10 @@ +dateutil +pytz +openpyxl +xlrd +numpy +cython +scipy +numexpr +pytables +matplotlib diff --git a/ci/run_with_env.cmd b/ci/run_with_env.cmd new file mode 100644 index 0000000000000..3a472bc836c30 --- /dev/null +++ b/ci/run_with_env.cmd @@ -0,0 +1,47 @@ +:: To build extensions for 64 bit Python 3, we need to configure environment +:: variables to use the MSVC 2010 C++ compilers from GRMSDKX_EN_DVD.iso of: +:: MS Windows SDK for Windows 7 and .NET Framework 4 (SDK v7.1) +:: +:: To build extensions for 64 bit Python 2, we need to configure environment +:: variables to use the MSVC 2008 C++ compilers from GRMSDKX_EN_DVD.iso of: +:: MS Windows SDK for Windows 7 and .NET Framework 3.5 (SDK v7.0) +:: +:: 32 bit builds do not require specific environment configurations. +:: +:: Note: this script needs to be run with the /E:ON and /V:ON flags for the +:: cmd interpreter, at least for (SDK v7.0) +:: +:: More details at: +:: https://github.com/cython/cython/wiki/64BitCythonExtensionsOnWindows +:: http://stackoverflow.com/a/13751649/163740 +:: +:: Author: Olivier Grisel +:: License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ +@ECHO OFF + +SET COMMAND_TO_RUN=%* +SET WIN_SDK_ROOT=C:\Program Files\Microsoft SDKs\Windows + +SET MAJOR_PYTHON_VERSION="%PYTHON_VERSION:~0,1%" +IF %MAJOR_PYTHON_VERSION% == "2" ( + SET WINDOWS_SDK_VERSION="v7.0" +) ELSE IF %MAJOR_PYTHON_VERSION% == "3" ( + SET WINDOWS_SDK_VERSION="v7.1" +) ELSE ( + ECHO Unsupported Python version: "%MAJOR_PYTHON_VERSION%" + EXIT 1 +) + +IF "%PYTHON_ARCH%"=="64" ( + ECHO Configuring Windows SDK %WINDOWS_SDK_VERSION% for Python %MAJOR_PYTHON_VERSION% on a 64 bit architecture + SET DISTUTILS_USE_SDK=1 + SET MSSdk=1 + "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Setup\WindowsSdkVer.exe" -q -version:%WINDOWS_SDK_VERSION% + "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Bin\SetEnv.cmd" /x64 /release + ECHO Executing: %COMMAND_TO_RUN% + call %COMMAND_TO_RUN% || EXIT 1 +) ELSE ( + ECHO Using default MSVC build environment for 32 bit architecture + ECHO Executing: %COMMAND_TO_RUN% + call %COMMAND_TO_RUN% || EXIT 1 +) diff --git a/ci/script.sh b/ci/script.sh index 152a2f1ebdcf9..b1ba7ba79c816 100755 --- a/ci/script.sh +++ b/ci/script.sh @@ -2,22 +2,23 @@ echo "inside $0" +source activate pandas + if [ -n "$LOCALE_OVERRIDE" ]; then export LC_ALL="$LOCALE_OVERRIDE"; echo "Setting LC_ALL to $LOCALE_OVERRIDE" - curdir="$(pwd)" - cd /tmp + pycmd='import pandas; print("pandas detected console encoding: %s" % pandas.get_option("display.encoding"))' python -c "$pycmd" - cd "$curdir" fi # conditionally build and upload docs to GH/pandas-docs/pandas-docs/travis "$TRAVIS_BUILD_DIR"/ci/build_docs.sh 2>&1 > /tmp/doc.log & # doc build log will be shown after tests -echo nosetests --exe -w /tmp -A "$NOSE_ARGS" pandas --with-xunit --xunit-file=/tmp/nosetests.xml -nosetests --exe -w /tmp -A "$NOSE_ARGS" pandas --with-xunit 
--xunit-file=/tmp/nosetests.xml + +echo nosetests --exe -A "$NOSE_ARGS" pandas --with-xunit --xunit-file=/tmp/nosetests.xml +nosetests --exe -A "$NOSE_ARGS" pandas --with-xunit --xunit-file=/tmp/nosetests.xml RET="$?" diff --git a/doc/make.py b/doc/make.py index 4367ac91396bb..6b424ce2814d5 100755 --- a/doc/make.py +++ b/doc/make.py @@ -31,48 +31,48 @@ SPHINX_BUILD = 'sphinxbuild' -def upload_dev(): +def upload_dev(user='pandas'): 'push a copy to the pydata dev directory' - if os.system('cd build/html; rsync -avz . pandas@pandas.pydata.org' - ':/usr/share/nginx/pandas/pandas-docs/dev/ -essh'): + if os.system('cd build/html; rsync -avz . {0}@pandas.pydata.org' + ':/usr/share/nginx/pandas/pandas-docs/dev/ -essh'.format(user)): raise SystemExit('Upload to Pydata Dev failed') -def upload_dev_pdf(): +def upload_dev_pdf(user='pandas'): 'push a copy to the pydata dev directory' - if os.system('cd build/latex; scp pandas.pdf pandas@pandas.pydata.org' - ':/usr/share/nginx/pandas/pandas-docs/dev/'): + if os.system('cd build/latex; scp pandas.pdf {0}@pandas.pydata.org' + ':/usr/share/nginx/pandas/pandas-docs/dev/'.format(user)): raise SystemExit('PDF upload to Pydata Dev failed') -def upload_stable(): +def upload_stable(user='pandas'): 'push a copy to the pydata stable directory' - if os.system('cd build/html; rsync -avz . pandas@pandas.pydata.org' - ':/usr/share/nginx/pandas/pandas-docs/stable/ -essh'): + if os.system('cd build/html; rsync -avz . {0}@pandas.pydata.org' + ':/usr/share/nginx/pandas/pandas-docs/stable/ -essh'.format(user)): raise SystemExit('Upload to stable failed') -def upload_stable_pdf(): +def upload_stable_pdf(user='pandas'): 'push a copy to the pydata dev directory' - if os.system('cd build/latex; scp pandas.pdf pandas@pandas.pydata.org' - ':/usr/share/nginx/pandas/pandas-docs/stable/'): + if os.system('cd build/latex; scp pandas.pdf {0}@pandas.pydata.org' + ':/usr/share/nginx/pandas/pandas-docs/stable/'.format(user)): raise SystemExit('PDF upload to stable failed') -def upload_prev(ver, doc_root='./'): +def upload_prev(ver, doc_root='./', user='pandas'): 'push a copy of older release to appropriate version directory' local_dir = doc_root + 'build/html' remote_dir = '/usr/share/nginx/pandas/pandas-docs/version/%s/' % ver - cmd = 'cd %s; rsync -avz . pandas@pandas.pydata.org:%s -essh' - cmd = cmd % (local_dir, remote_dir) + cmd = 'cd %s; rsync -avz . %s@pandas.pydata.org:%s -essh' + cmd = cmd % (local_dir, user, remote_dir) print(cmd) if os.system(cmd): raise SystemExit( 'Upload to %s from %s failed' % (remote_dir, local_dir)) local_dir = doc_root + 'build/latex' - pdf_cmd = 'cd %s; scp pandas.pdf pandas@pandas.pydata.org:%s' - pdf_cmd = pdf_cmd % (local_dir, remote_dir) + pdf_cmd = 'cd %s; scp pandas.pdf %s@pandas.pydata.org:%s' + pdf_cmd = pdf_cmd % (local_dir, user, remote_dir) if os.system(pdf_cmd): raise SystemExit('Upload PDF to %s from %s failed' % (ver, doc_root)) @@ -337,6 +337,10 @@ def generate_index(api=True, single=False, **kwds): type=str, default=False, help='filename of section to compile, e.g. 
"indexing"') +argparser.add_argument('--user', + type=str, + default=False, + help='Username to connect to the pydata server') def main(): args, unknown = argparser.parse_known_args() @@ -354,16 +358,19 @@ def main(): ver = sys.argv[2] if ftype == 'build_previous': - build_prev(ver) + build_prev(ver, user=args.user) if ftype == 'upload_previous': - upload_prev(ver) + upload_prev(ver, user=args.user) elif len(sys.argv) == 2: for arg in sys.argv[1:]: func = funcd.get(arg) if func is None: raise SystemExit('Do not know how to handle %s; valid args are %s' % ( arg, list(funcd.keys()))) - func() + if args.user: + func(user=args.user) + else: + func() else: small_docs = False all() diff --git a/doc/source/10min.rst b/doc/source/10min.rst index 985f112979a7e..c98f41973e1ee 100644 --- a/doc/source/10min.rst +++ b/doc/source/10min.rst @@ -66,7 +66,8 @@ Creating a ``DataFrame`` by passing a dict of objects that can be converted to s 'B' : pd.Timestamp('20130102'), 'C' : pd.Series(1,index=list(range(4)),dtype='float32'), 'D' : np.array([3] * 4,dtype='int32'), - 'E' : 'foo' }) + 'E' : pd.Categorical(["test","train","test","train"]), + 'F' : 'foo' }) df2 Having specific :ref:`dtypes ` @@ -165,7 +166,7 @@ Selection recommend the optimized pandas data access methods, ``.at``, ``.iat``, ``.loc``, ``.iloc`` and ``.ix``. -See the :ref:`Indexing section ` and below. +See the indexing documentation :ref:`Indexing and Selecing Data ` and :ref:`MultiIndex / Advanced Indexing ` Getting ~~~~~~~ @@ -432,7 +433,12 @@ See more at :ref:`Histogramming and Discretization ` String Methods ~~~~~~~~~~~~~~ -See more at :ref:`Vectorized String Methods ` +Series is equipped with a set of string processing methods in the `str` +attribute that make it easy to operate on each element of the array, as in the +code snippet below. Note that pattern-matching in `str` generally uses `regular +expressions `__ by default (and in +some cases always uses them). See more at :ref:`Vectorized String Methods +`. .. ipython:: python @@ -528,7 +534,7 @@ the function. Reshaping --------- -See the sections on :ref:`Hierarchical Indexing ` and +See the sections on :ref:`Hierarchical Indexing ` and :ref:`Reshaping `. Stack @@ -635,6 +641,49 @@ the quarter end: ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9 ts.head() +Categoricals +------------ + +Since version 0.15, pandas can include categorical data in a ``DataFrame``. For full docs, see the +:ref:`categorical introduction ` and the :ref:`API documentation `. + +.. ipython:: python + + df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']}) + +Convert the raw grades to a categorical data type. + +.. ipython:: python + + df["grade"] = df["raw_grade"].astype("category") + df["grade"] + +Rename the categories to more meaningful names (assigning to ``Series.cat.categories`` is inplace!) + +.. ipython:: python + + df["grade"].cat.categories = ["very good", "good", "very bad"] + +Reorder the categories and simultaneously add the missing categories (methods under ``Series +.cat`` return a new ``Series`` per default). + +.. ipython:: python + + df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"]) + df["grade"] + +Sorting is per order in the categories, not lexical order. + +.. ipython:: python + + df.sort("grade") + +Grouping by a categorical column shows also empty categories. + +.. 
ipython:: python + + df.groupby("grade").size() + Plotting -------- diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst new file mode 100644 index 0000000000000..1749409c863df --- /dev/null +++ b/doc/source/advanced.rst @@ -0,0 +1,709 @@ +.. _advanced: + +.. currentmodule:: pandas + +.. ipython:: python + :suppress: + + import numpy as np + import random + np.random.seed(123456) + from pandas import * + options.display.max_rows=15 + import pandas as pd + randn = np.random.randn + randint = np.random.randint + np.set_printoptions(precision=4, suppress=True) + from pandas.compat import range, zip + +****************************** +MultiIndex / Advanced Indexing +****************************** + +This section covers indexing with a ``MultiIndex`` and more advanced indexing features. + +See the :ref:`Indexing and Selecting Data ` for general indexing documentation. + +.. warning:: + + Whether a copy or a reference is returned for a setting operation, may + depend on the context. This is sometimes called ``chained assignment`` and + should be avoided. See :ref:`Returning a View versus Copy + ` + +.. warning:: + + In 0.15.0 ``Index`` has internally been refactored to no longer sub-class ``ndarray`` + but instead subclass ``PandasObject``, similarly to the rest of the pandas objects. This should be + a transparent change with only very limited API implications (See the :ref:`Internal Refactoring `) + +See the :ref:`cookbook` for some advanced strategies + +.. _advanced.hierarchical: + +Hierarchical indexing (MultiIndex) +---------------------------------- + +Hierarchical / Multi-level indexing is very exciting as it opens the door to some +quite sophisticated data analysis and manipulation, especially for working with +higher dimensional data. In essence, it enables you to store and manipulate +data with an arbitrary number of dimensions in lower dimensional data +structures like Series (1d) and DataFrame (2d). + +In this section, we will show what exactly we mean by "hierarchical" indexing +and how it integrates with the all of the pandas indexing functionality +described above and in prior sections. Later, when discussing :ref:`group by +` and :ref:`pivoting and reshaping data `, we'll show +non-trivial applications to illustrate how it aids in structuring data for +analysis. + +See the :ref:`cookbook` for some advanced strategies + +Creating a MultiIndex (hierarchical index) object +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``MultiIndex`` object is the hierarchical analogue of the standard +``Index`` object which typically stores the axis labels in pandas objects. You +can think of ``MultiIndex`` an array of tuples where each tuple is unique. A +``MultiIndex`` can be created from a list of arrays (using +``MultiIndex.from_arrays``), an array of tuples (using +``MultiIndex.from_tuples``), or a crossed set of iterables (using +``MultiIndex.from_product``). The ``Index`` constructor will attempt to return +a ``MultiIndex`` when it is passed a list of tuples. The following examples +demo different ways to initialize MultiIndexes. + + +.. 
ipython:: python + + arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + tuples = list(zip(*arrays)) + tuples + + index = MultiIndex.from_tuples(tuples, names=['first', 'second']) + index + + s = Series(randn(8), index=index) + s + +When you want every pairing of the elements in two iterables, it can be easier +to use the ``MultiIndex.from_product`` function: + +.. ipython:: python + + iterables = [['bar', 'baz', 'foo', 'qux'], ['one', 'two']] + MultiIndex.from_product(iterables, names=['first', 'second']) + +As a convenience, you can pass a list of arrays directly into Series or +DataFrame to construct a MultiIndex automatically: + +.. ipython:: python + + arrays = [np.array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']), + np.array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'])] + s = Series(randn(8), index=arrays) + s + df = DataFrame(randn(8, 4), index=arrays) + df + +All of the ``MultiIndex`` constructors accept a ``names`` argument which stores +string names for the levels themselves. If no names are provided, ``None`` will +be assigned: + +.. ipython:: python + + df.index.names + +This index can back any axis of a pandas object, and the number of **levels** +of the index is up to you: + +.. ipython:: python + + df = DataFrame(randn(3, 8), index=['A', 'B', 'C'], columns=index) + df + DataFrame(randn(6, 6), index=index[:6], columns=index[:6]) + +We've "sparsified" the higher levels of the indexes to make the console output a +bit easier on the eyes. + +It's worth keeping in mind that there's nothing preventing you from using +tuples as atomic labels on an axis: + +.. ipython:: python + + Series(randn(8), index=tuples) + +The reason that the ``MultiIndex`` matters is that it can allow you to do +grouping, selection, and reshaping operations as we will describe below and in +subsequent areas of the documentation. As you will see in later sections, you +can find yourself working with hierarchically-indexed data without creating a +``MultiIndex`` explicitly yourself. However, when loading data from a file, you +may wish to generate your own ``MultiIndex`` when preparing the data set. + +Note that how the index is displayed by be controlled using the +``multi_sparse`` option in ``pandas.set_printoptions``: + +.. ipython:: python + + pd.set_option('display.multi_sparse', False) + df + pd.set_option('display.multi_sparse', True) + +.. _advanced.get_level_values: + +Reconstructing the level labels +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The method ``get_level_values`` will return a vector of the labels for each +location at a particular level: + +.. ipython:: python + + index.get_level_values(0) + index.get_level_values('second') + + +Basic indexing on axis with MultiIndex +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +One of the important features of hierarchical indexing is that you can select +data by a "partial" label identifying a subgroup in the data. **Partial** +selection "drops" levels of the hierarchical index in the result in a +completely analogous way to selecting a column in a regular DataFrame: + +.. ipython:: python + + df['bar'] + df['bar', 'one'] + df['bar']['one'] + s['qux'] + +See :ref:`Cross-section with hierarchical index ` for how to select +on a deeper level. 
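As a minimal sketch of the partial-label selection just described (the frame and values here are made up for illustration, assuming only pandas and numpy are available):

.. code-block:: python

    import numpy as np
    import pandas as pd

    # build a two-level MultiIndex from the cross product of two iterables
    idx = pd.MultiIndex.from_product([['bar', 'baz', 'foo'], ['one', 'two']],
                                     names=['first', 'second'])
    df = pd.DataFrame(np.random.randn(6, 2), index=idx, columns=['A', 'B'])

    # partial selection on the outer level drops that level from the result,
    # analogous to selecting a single column from a regular DataFrame
    df.loc['bar']

    # supplying the full tuple selects a single row
    df.loc[('bar', 'one')]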
+ + +Data alignment and using ``reindex`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Operations between differently-indexed objects having ``MultiIndex`` on the +axes will work as you expect; data alignment will work the same as an Index of +tuples: + +.. ipython:: python + + s + s[:-2] + s + s[::2] + +``reindex`` can be called with another ``MultiIndex`` or even a list or array +of tuples: + +.. ipython:: python + + s.reindex(index[:3]) + s.reindex([('foo', 'two'), ('bar', 'one'), ('qux', 'one'), ('baz', 'one')]) + +.. _advanced.advanced_hierarchical: + +Advanced indexing with hierarchical index +----------------------------------------- + +Syntactically integrating ``MultiIndex`` in advanced indexing with ``.loc/.ix`` is a +bit challenging, but we've made every effort to do so. for example the +following works as you would expect: + +.. ipython:: python + + df = df.T + df + df.loc['bar'] + df.loc['bar', 'two'] + +"Partial" slicing also works quite nicely. + +.. ipython:: python + + df.loc['baz':'foo'] + +You can slice with a 'range' of values, by providing a slice of tuples. + +.. ipython:: python + + df.loc[('baz', 'two'):('qux', 'one')] + df.loc[('baz', 'two'):'foo'] + +Passing a list of labels or tuples works similar to reindexing: + +.. ipython:: python + + df.ix[[('bar', 'two'), ('qux', 'one')]] + +.. _advanced.mi_slicers: + +Using slicers +~~~~~~~~~~~~~ + +.. versionadded:: 0.14.0 + +In 0.14.0 we added a new way to slice multi-indexed objects. +You can slice a multi-index by providing multiple indexers. + +You can provide any of the selectors as if you are indexing by label, see :ref:`Selection by Label `, +including slices, lists of labels, labels, and boolean indexers. + +You can use ``slice(None)`` to select all the contents of *that* level. You do not need to specify all the +*deeper* levels, they will be implied as ``slice(None)``. + +As usual, **both sides** of the slicers are included as this is label indexing. + +.. warning:: + + You should specify all axes in the ``.loc`` specifier, meaning the indexer for the **index** and + for the **columns**. Their are some ambiguous cases where the passed indexer could be mis-interpreted + as indexing *both* axes, rather than into say the MuliIndex for the rows. + + You should do this: + + .. code-block:: python + + df.loc[(slice('A1','A3'),.....),:] + + rather than this: + + .. code-block:: python + + df.loc[(slice('A1','A3'),.....)] + +.. warning:: + + You will need to make sure that the selection axes are fully lexsorted! + +.. ipython:: python + + def mklbl(prefix,n): + return ["%s%s" % (prefix,i) for i in range(n)] + + miindex = MultiIndex.from_product([mklbl('A',4), + mklbl('B',2), + mklbl('C',4), + mklbl('D',2)]) + micolumns = MultiIndex.from_tuples([('a','foo'),('a','bar'), + ('b','foo'),('b','bah')], + names=['lvl0', 'lvl1']) + dfmi = DataFrame(np.arange(len(miindex)*len(micolumns)).reshape((len(miindex),len(micolumns))), + index=miindex, + columns=micolumns).sortlevel().sortlevel(axis=1) + dfmi + +Basic multi-index slicing using slices, lists, and labels. + +.. ipython:: python + + dfmi.loc[(slice('A1','A3'),slice(None), ['C1','C3']),:] + +You can use a ``pd.IndexSlice`` to have a more natural syntax using ``:`` rather than using ``slice(None)`` + +.. ipython:: python + + idx = pd.IndexSlice + dfmi.loc[idx[:,:,['C1','C3']],idx[:,'foo']] + +It is possible to perform quite complicated selections using this method on multiple +axes at the same time. + +.. 
ipython:: python + + dfmi.loc['A1',(slice(None),'foo')] + dfmi.loc[idx[:,:,['C1','C3']],idx[:,'foo']] + +Using a boolean indexer you can provide selection related to the *values*. + +.. ipython:: python + + mask = dfmi[('a','foo')]>200 + dfmi.loc[idx[mask,:,['C1','C3']],idx[:,'foo']] + +You can also specify the ``axis`` argument to ``.loc`` to interpret the passed +slicers on a single axis. + +.. ipython:: python + + dfmi.loc(axis=0)[:,:,['C1','C3']] + +Furthermore you can *set* the values using these methods + +.. ipython:: python + + df2 = dfmi.copy() + df2.loc(axis=0)[:,:,['C1','C3']] = -10 + df2 + +You can use a right-hand-side of an alignable object as well. + +.. ipython:: python + + df2 = dfmi.copy() + df2.loc[idx[:,:,['C1','C3']],:] = df2*1000 + df2 + +.. _advanced.xs: + +Cross-section +~~~~~~~~~~~~~ + +The ``xs`` method of ``DataFrame`` additionally takes a level argument to make +selecting data at a particular level of a MultiIndex easier. + +.. ipython:: python + + df + df.xs('one', level='second') + +.. ipython:: python + + # using the slicers (new in 0.14.0) + df.loc[(slice(None),'one'),:] + +You can also select on the columns with :meth:`~pandas.MultiIndex.xs`, by +providing the axis argument + +.. ipython:: python + + df = df.T + df.xs('one', level='second', axis=1) + +.. ipython:: python + + # using the slicers (new in 0.14.0) + df.loc[:,(slice(None),'one')] + +:meth:`~pandas.MultiIndex.xs` also allows selection with multiple keys + +.. ipython:: python + + df.xs(('one', 'bar'), level=('second', 'first'), axis=1) + +.. ipython:: python + + # using the slicers (new in 0.14.0) + df.loc[:,('bar','one')] + +.. versionadded:: 0.13.0 + +You can pass ``drop_level=False`` to :meth:`~pandas.MultiIndex.xs` to retain +the level that was selected + +.. ipython:: python + + df.xs('one', level='second', axis=1, drop_level=False) + +versus the result with ``drop_level=True`` (the default value) + +.. ipython:: python + + df.xs('one', level='second', axis=1, drop_level=True) + +.. ipython:: python + :suppress: + + df = df.T + +.. _advanced.advanced_reindex: + +Advanced reindexing and alignment +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The parameter ``level`` has been added to the ``reindex`` and ``align`` methods +of pandas objects. This is useful to broadcast values across a level. For +instance: + +.. ipython:: python + + midx = MultiIndex(levels=[['zero', 'one'], ['x','y']], + labels=[[1,1,0,0],[1,0,1,0]]) + df = DataFrame(randn(4,2), index=midx) + df + df2 = df.mean(level=0) + df2 + df2.reindex(df.index, level=0) + + # aligning + df_aligned, df2_aligned = df.align(df2, level=0) + df_aligned + df2_aligned + + +Swapping levels with :meth:`~pandas.MultiIndex.swaplevel` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``swaplevel`` function can switch the order of two levels: + +.. ipython:: python + + df[:5] + df[:5].swaplevel(0, 1, axis=0) + +.. _advanced.reorderlevels: + +Reordering levels with :meth:`~pandas.MultiIndex.reorder_levels` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``reorder_levels`` function generalizes the ``swaplevel`` function, +allowing you to permute the hierarchical index levels in one step: + +.. 
ipython:: python + + df[:5].reorder_levels([1,0], axis=0) + +The need for sortedness with :class:`~pandas.MultiIndex` +-------------------------------------------------------- + +**Caveat emptor**: the present implementation of ``MultiIndex`` requires that +the labels be sorted for some of the slicing / indexing routines to work +correctly. You can think about breaking the axis into unique groups, where at +the hierarchical level of interest, each distinct group shares a label, but no +two have the same label. However, the ``MultiIndex`` does not enforce this: +**you are responsible for ensuring that things are properly sorted**. There is +an important new method ``sortlevel`` to sort an axis within a ``MultiIndex`` +so that its labels are grouped and sorted by the original ordering of the +associated factor at that level. Note that this does not necessarily mean the +labels will be sorted lexicographically! + +.. ipython:: python + + import random; random.shuffle(tuples) + s = Series(randn(8), index=MultiIndex.from_tuples(tuples)) + s + s.sortlevel(0) + s.sortlevel(1) + +.. _advanced.sortlevel_byname: + +Note, you may also pass a level name to ``sortlevel`` if the MultiIndex levels +are named. + +.. ipython:: python + + s.index.set_names(['L1', 'L2'], inplace=True) + s.sortlevel(level='L1') + s.sortlevel(level='L2') + +Some indexing will work even if the data are not sorted, but will be rather +inefficient and will also return a copy of the data rather than a view: + +.. ipython:: python + + s['qux'] + s.sortlevel(1)['qux'] + +On higher dimensional objects, you can sort any of the other axes by level if +they have a MultiIndex: + +.. ipython:: python + + df.T.sortlevel(1, axis=1) + +The ``MultiIndex`` object has code to **explicity check the sort depth**. Thus, +if you try to index at a depth at which the index is not sorted, it will raise +an exception. Here is a concrete example to illustrate this: + +.. ipython:: python + + tuples = [('a', 'a'), ('a', 'b'), ('b', 'a'), ('b', 'b')] + idx = MultiIndex.from_tuples(tuples) + idx.lexsort_depth + + reordered = idx[[1, 0, 3, 2]] + reordered.lexsort_depth + + s = Series(randn(4), index=reordered) + s.ix['a':'a'] + +However: + +:: + + >>> s.ix[('a', 'b'):('b', 'a')] + Traceback (most recent call last) + ... + KeyError: Key length (3) was greater than MultiIndex lexsort depth (2) + + +Take Methods +------------ + +.. _advanced.take: + +Similar to numpy ndarrays, pandas Index, Series, and DataFrame also provides +the ``take`` method that retrieves elements along a given axis at the given +indices. The given indices must be either a list or an ndarray of integer +index positions. ``take`` will also accept negative integers as relative positions to the end of the object. + +.. ipython:: python + + index = Index(randint(0, 1000, 10)) + index + + positions = [0, 9, 3] + + index[positions] + index.take(positions) + + ser = Series(randn(10)) + + ser.iloc[positions] + ser.take(positions) + +For DataFrames, the given indices should be a 1d list or ndarray that specifies +row or column positions. + +.. ipython:: python + + frm = DataFrame(randn(5, 3)) + + frm.take([1, 4, 3]) + + frm.take([0, 2], axis=1) + +It is important to note that the ``take`` method on pandas objects are not +intended to work on boolean indices and may return unexpected results. + +.. 
ipython:: python + + arr = randn(10) + arr.take([False, False, True, True]) + arr[[0, 1]] + + ser = Series(randn(10)) + ser.take([False, False, True, True]) + ser.ix[[0, 1]] + +Finally, as a small note on performance, because the ``take`` method handles +a narrower range of inputs, it can offer performance that is a good deal +faster than fancy indexing. + +.. ipython:: + + arr = randn(10000, 5) + indexer = np.arange(10000) + random.shuffle(indexer) + + timeit arr[indexer] + timeit arr.take(indexer, axis=0) + + ser = Series(arr[:, 0]) + timeit ser.ix[indexer] + timeit ser.take(indexer) + +.. _indexing.float64index: + +Float64Index +------------ + +.. note:: + + As of 0.14.0, ``Float64Index`` is backed by a native ``float64`` dtype + array. Prior to 0.14.0, ``Float64Index`` was backed by an ``object`` dtype + array. Using a ``float64`` dtype in the backend speeds up arithmetic + operations by about 30x and boolean indexing operations on the + ``Float64Index`` itself are about 2x as fast. + + +.. versionadded:: 0.13.0 + +By default a ``Float64Index`` will be automatically created when passing floating, or mixed-integer-floating values in index creation. +This enables a pure label-based slicing paradigm that makes ``[],ix,loc`` for scalar indexing and slicing work exactly the +same. + +.. ipython:: python + + indexf = Index([1.5, 2, 3, 4.5, 5]) + indexf + sf = Series(range(5),index=indexf) + sf + +Scalar selection for ``[],.ix,.loc`` will always be label based. An integer will match an equal float index (e.g. ``3`` is equivalent to ``3.0``) + +.. ipython:: python + + sf[3] + sf[3.0] + sf.ix[3] + sf.ix[3.0] + sf.loc[3] + sf.loc[3.0] + +The only positional indexing is via ``iloc`` + +.. ipython:: python + + sf.iloc[3] + +A scalar index that is not found will raise ``KeyError`` + +Slicing is ALWAYS on the values of the index, for ``[],ix,loc`` and ALWAYS positional with ``iloc`` + +.. ipython:: python + + sf[2:4] + sf.ix[2:4] + sf.loc[2:4] + sf.iloc[2:4] + +In float indexes, slicing using floats is allowed + +.. ipython:: python + + sf[2.1:4.6] + sf.loc[2.1:4.6] + +In non-float indexes, slicing using floats will raise a ``TypeError`` + +.. code-block:: python + + In [1]: Series(range(5))[3.5] + TypeError: the label [3.5] is not a proper indexer for this index type (Int64Index) + + In [1]: Series(range(5))[3.5:4.5] + TypeError: the slice start [3.5] is not a proper indexer for this index type (Int64Index) + +Using a scalar float indexer will be deprecated in a future version, but is allowed for now. + +.. code-block:: python + + In [3]: Series(range(5))[3.0] + Out[3]: 3 + +Here is a typical use-case for using this type of indexing. Imagine that you have a somewhat +irregular timedelta-like indexing scheme, but the data is recorded as floats. This could for +example be millisecond offsets. + +.. ipython:: python + + dfir = concat([DataFrame(randn(5,2), + index=np.arange(5) * 250.0, + columns=list('AB')), + DataFrame(randn(6,2), + index=np.arange(4,10) * 250.1, + columns=list('AB'))]) + dfir + +Selection operations then will always work on a value basis, for all selection operators. + +.. ipython:: python + + dfir[0:1000.4] + dfir.loc[0:1001,'A'] + dfir.loc[1000.4] + +You could then easily pick out the first 1 second (1000 ms) of data then. + +.. ipython:: python + + dfir[0:1000] + +Of course if you need integer based selection, then use ``iloc`` + +.. 
ipython:: python + + dfir.iloc[0:5] + diff --git a/doc/source/api.rst b/doc/source/api.rst index feb4da700354d..2e913d8aae4da 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -146,8 +146,8 @@ Top-level missing data isnull notnull -Top-level dealing with datetimes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Top-level dealing with datetimelike +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: generated/ @@ -157,6 +157,7 @@ Top-level dealing with datetimes date_range bdate_range period_range + timedelta_range Top-level evaluation ~~~~~~~~~~~~~~~~~~~~ @@ -414,6 +415,7 @@ Reshaping, sorting Series.sortlevel Series.swaplevel Series.unstack + Series.searchsorted Combining / joining / merging ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -440,13 +442,16 @@ Time series-related Datetimelike Properties ~~~~~~~~~~~~~~~~~~~~~~~ + ``Series.dt`` can be used to access the values of the series as datetimelike and return several properties. Due to implementation details the methods show up here as methods of the -``DatetimeProperties/PeriodProperties`` classes. These can be accessed like ``Series.dt.``. +``DatetimeProperties/PeriodProperties/TimedeltaProperties`` classes. These can be accessed like ``Series.dt.``. .. currentmodule:: pandas.tseries.common +**Datetime Properties** + .. autosummary:: :toctree: generated/ @@ -473,6 +478,37 @@ Due to implementation details the methods show up here as methods of the DatetimeProperties.is_year_start DatetimeProperties.is_year_end +**Datetime Methods** + +.. autosummary:: + :toctree: generated/ + + DatetimeProperties.to_period + DatetimeProperties.to_pydatetime + DatetimeProperties.tz_localize + DatetimeProperties.tz_convert + +**Timedelta Properties** + +.. autosummary:: + :toctree: generated/ + + TimedeltaProperties.days + TimedeltaProperties.hours + TimedeltaProperties.minutes + TimedeltaProperties.seconds + TimedeltaProperties.milliseconds + TimedeltaProperties.microseconds + TimedeltaProperties.nanoseconds + TimedeltaProperties.components + +**Timedelta Methods** + +.. autosummary:: + :toctree: generated/ + + TimedeltaProperties.to_pytimedelta + String handling ~~~~~~~~~~~~~~~ ``Series.str`` can be used to access the values of the series as @@ -520,52 +556,41 @@ Categorical .. currentmodule:: pandas.core.categorical -If the Series is of dtype ``category``, ``Series.cat`` can be used to access the the underlying -``Categorical``. This data type is similar to the otherwise underlying numpy array -and has the following usable methods and properties (all available as -``Series.cat.``). - +If the Series is of dtype ``category``, ``Series.cat`` can be used to change the the categorical +data. This accessor is similar to the ``Series.dt`` or ``Series.str`` and has the +following usable methods and properties (all available as ``Series.cat.``). .. autosummary:: :toctree: generated/ - Categorical - Categorical.from_codes - Categorical.levels + Categorical.categories Categorical.ordered - Categorical.reorder_levels - Categorical.remove_unused_levels - Categorical.min - Categorical.max - Categorical.mode - Categorical.describe + Categorical.rename_categories + Categorical.reorder_categories + Categorical.add_categories + Categorical.remove_categories + Categorical.remove_unused_categories + Categorical.set_categories + Categorical.codes -``np.asarray(categorical)`` works by implementing the array interface. Be aware, that this converts -the Categorical back to a numpy array, so levels and order information is not preserved! 
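As a short sketch of the accessor usage described above (data invented for illustration; assumes pandas 0.15 or later):

.. code-block:: python

    import pandas as pd

    s = pd.Series(["a", "b", "b", "a"]).astype("category")

    # the .cat accessor exposes the categorical metadata and methods
    s.cat.categories                       # the category labels
    s.cat.codes                            # integer codes backing each value

    # these methods return a new Series rather than modifying s in place
    s = s.cat.rename_categories(["low", "high"])
    s = s.cat.add_categories(["medium"])
    s = s.cat.remove_unused_categories()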
+To create a Series of dtype ``category``, use ``cat = s.astype("category")``. + +The following two ``Categorical`` constructors are considered API but should only be used when +adding ordering information or special categories is need at creation time of the categorical data: .. autosummary:: :toctree: generated/ - Categorical.__array__ + Categorical + Categorical.from_codes -To create compatibility with `pandas.Series` and `numpy` arrays, the following (non-API) methods -are also introduced. +``np.asarray(categorical)`` works by implementing the array interface. Be aware, that this converts +the Categorical back to a numpy array, so levels and order information is not preserved! .. autosummary:: :toctree: generated/ - Categorical.from_array - Categorical.get_values - Categorical.copy - Categorical.dtype - Categorical.ndim - Categorical.sort - Categorical.equals - Categorical.unique - Categorical.order - Categorical.argsort - Categorical.fillna - + Categorical.__array__ Plotting ~~~~~~~~ @@ -794,7 +819,6 @@ Reshaping, sorting, transposing .. autosummary:: :toctree: generated/ - DataFrame.delevel DataFrame.pivot DataFrame.reorder_levels DataFrame.sort @@ -1307,6 +1331,37 @@ Conversion DatetimeIndex.to_pydatetime DatetimeIndex.to_series +TimedeltaIndex +-------------- + +.. autosummary:: + :toctree: generated/ + + TimedeltaIndex + +Components +~~~~~~~~~~ + +.. autosummary:: + :toctree: generated/ + + TimedeltaIndex.days + TimedeltaIndex.hours + TimedeltaIndex.minutes + TimedeltaIndex.seconds + TimedeltaIndex.milliseconds + TimedeltaIndex.microseconds + TimedeltaIndex.nanoseconds + TimedeltaIndex.components + +Conversion +~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + TimedeltaIndex.to_pytimedelta + TimedeltaIndex.to_series + GroupBy ------- .. currentmodule:: pandas.core.groupby @@ -1346,12 +1401,80 @@ Computations / Descriptive Stats .. autosummary:: :toctree: generated/ + GroupBy.count + GroupBy.cumcount + GroupBy.first + GroupBy.head + GroupBy.last + GroupBy.max GroupBy.mean GroupBy.median + GroupBy.min + GroupBy.nth + GroupBy.ohlc + GroupBy.prod + GroupBy.size GroupBy.sem GroupBy.std + GroupBy.sum GroupBy.var - GroupBy.ohlc + GroupBy.tail + +The following methods are available in both ``SeriesGroupBy`` and +``DataFrameGroupBy`` objects, but may differ slightly, usually in that +the ``DataFrameGroupBy`` version usually permits the specification of an +axis argument, and often an argument indicating whether to restrict +application to columns of a specific data type. + +.. autosummary:: + :toctree: generated/ + + DataFrameGroupBy.bfill + DataFrameGroupBy.cummax + DataFrameGroupBy.cummin + DataFrameGroupBy.cumprod + DataFrameGroupBy.cumsum + DataFrameGroupBy.describe + DataFrameGroupBy.all + DataFrameGroupBy.any + DataFrameGroupBy.corr + DataFrameGroupBy.cov + DataFrameGroupBy.diff + DataFrameGroupBy.ffill + DataFrameGroupBy.fillna + DataFrameGroupBy.hist + DataFrameGroupBy.idxmax + DataFrameGroupBy.idxmin + DataFrameGroupBy.irow + DataFrameGroupBy.mad + DataFrameGroupBy.pct_change + DataFrameGroupBy.plot + DataFrameGroupBy.quantile + DataFrameGroupBy.rank + DataFrameGroupBy.resample + DataFrameGroupBy.shift + DataFrameGroupBy.skew + DataFrameGroupBy.take + DataFrameGroupBy.tshift + +The following methods are available only for ``SeriesGroupBy`` objects. + +.. 
autosummary:: + :toctree: generated/ + + SeriesGroupBy.nlargest + SeriesGroupBy.nsmallest + SeriesGroupBy.nunique + SeriesGroupBy.unique + SeriesGroupBy.value_counts + +The following methods are available only for ``DataFrameGroupBy`` objects. + +.. autosummary:: + :toctree: generated/ + + DataFrameGroupBy.corrwith + DataFrameGroupBy.boxplot .. currentmodule:: pandas diff --git a/doc/source/basics.rst b/doc/source/basics.rst index e880bb2d6b952..7ee82cd69a257 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -317,6 +317,16 @@ locations treated as equal. (df+df).equals(df*2) +Note that the Series or DataFrame index needs to be in the same order for +equality to be True: + +.. ipython:: python + + df1 = DataFrame({'col':['foo', 0, np.nan]}) + df2 = DataFrame({'col':[np.nan, 0, 'foo']}, index=[2,1,0]) + df1.equals(df2) + df1.equals(df2.sort()) + Combining overlapping data sets ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -410,7 +420,7 @@ values: Here is a quick reference summary table of common functions. Each also takes an optional ``level`` parameter which applies only if the object has a -:ref:`hierarchical index`. +:ref:`hierarchical index`. .. csv-table:: :header: "Function", "Description" @@ -490,9 +500,24 @@ number of unique values and most frequently occurring values: s = Series(['a', 'a', 'b', 'b', 'a', 'a', np.nan, 'c', 'd', 'a']) s.describe() +Note that on a mixed-type DataFrame object, `describe` will restrict the summary to +include only numerical columns or, if none are, only categorical columns: + +.. ipython:: python + + frame = DataFrame({'a': ['Yes', 'Yes', 'No', 'No'], 'b': range(4)}) + frame.describe() + +This behaviour can be controlled by providing a list of types as ``include``/``exclude`` +arguments. The special value ``all`` can also be used: + +.. ipython:: python + + frame.describe(include=['object']) + frame.describe(include=['number']) + frame.describe(include='all') -There also is a utility function, ``value_range`` which takes a DataFrame and -returns a series with the minimum/maximum values in the DataFrame. +That feature relies on :ref:`select_dtypes `. Refer to there for details about accepted inputs. .. _basics.idxmin: @@ -822,7 +847,7 @@ DataFrame's index. .. seealso:: - :ref:`Advanced indexing ` is an even more concise way of + :ref:`MultiIndex / Advanced Indexing ` is an even more concise way of doing reindexing. .. note:: @@ -1122,184 +1147,64 @@ This enables nice expressions like this: s[s.dt.day==2] -.. ipython:: python - - # period - s = Series(period_range('20130101',periods=4,freq='D').asobject) - s - s.dt.year - s.dt.day - -.. note:: - - ``Series.dt`` will raise a ``TypeError`` if you access with a non-datetimelike values - -.. _basics.string_methods: - -Vectorized string methods -------------------------- - -Series is equipped (as of pandas 0.8.1) with a set of string processing methods -that make it easy to operate on each element of the array. Perhaps most -importantly, these methods exclude missing/NA values automatically. These are -accessed via the Series's ``str`` attribute and generally have names matching -the equivalent (scalar) build-in string methods: - -Splitting and Replacing Strings -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. ipython:: python - - s = Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']) - s.str.lower() - s.str.upper() - s.str.len() - -Methods like ``split`` return a Series of lists: - -.. 
ipython:: python - - s2 = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h']) - s2.str.split('_') - -Elements in the split lists can be accessed using ``get`` or ``[]`` notation: +You can easily produces tz aware transformations: .. ipython:: python - s2.str.split('_').str.get(1) - s2.str.split('_').str[1] + stz = s.dt.tz_localize('US/Eastern') + stz + stz.dt.tz -Methods like ``replace`` and ``findall`` take regular expressions, too: +You can also chain these types of operations: .. ipython:: python - s3 = Series(['A', 'B', 'C', 'Aaba', 'Baca', - '', np.nan, 'CABA', 'dog', 'cat']) - s3 - s3.str.replace('^.a|dog', 'XX-XX ', case=False) + s.dt.tz_localize('UTC').dt.tz_convert('US/Eastern') -Extracting Substrings -~~~~~~~~~~~~~~~~~~~~~ - -The method ``extract`` (introduced in version 0.13) accepts regular expressions -with match groups. Extracting a regular expression with one group returns -a Series of strings. +The ``.dt`` accessor works for period and timedelta dtypes. .. ipython:: python - Series(['a1', 'b2', 'c3']).str.extract('[ab](\d)') - -Elements that do not match return ``NaN``. Extracting a regular expression -with more than one group returns a DataFrame with one column per group. - -.. ipython:: python - - Series(['a1', 'b2', 'c3']).str.extract('([ab])(\d)') - -Elements that do not match return a row filled with ``NaN``. -Thus, a Series of messy strings can be "converted" into a -like-indexed Series or DataFrame of cleaned-up or more useful strings, -without necessitating ``get()`` to access tuples or ``re.match`` objects. - -The results dtype always is object, even if no match is found and the result -only contains ``NaN``. - -Named groups like - -.. ipython:: python - - Series(['a1', 'b2', 'c3']).str.extract('(?P[ab])(?P\d)') - -and optional groups like - -.. ipython:: python - - Series(['a1', 'b2', '3']).str.extract('(?P[ab])?(?P\d)') - -can also be used. - -Testing for Strings that Match or Contain a Pattern -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -You can check whether elements contain a pattern: - -.. ipython:: python - - pattern = r'[a-z][0-9]' - Series(['1', '2', '3a', '3b', '03c']).str.contains(pattern) - -or match a pattern: - + # period + s = Series(period_range('20130101',periods=4,freq='D')) + s + s.dt.year + s.dt.day .. ipython:: python - Series(['1', '2', '3a', '3b', '03c']).str.match(pattern, as_indexer=True) - -The distinction between ``match`` and ``contains`` is strictness: ``match`` -relies on strict ``re.match``, while ``contains`` relies on ``re.search``. + # timedelta + s = Series(timedelta_range('1 day 00:00:05',periods=4,freq='s')) + s + s.dt.days + s.dt.seconds + s.dt.components -.. warning:: +.. note:: - In previous versions, ``match`` was for *extracting* groups, - returning a not-so-convenient Series of tuples. The new method ``extract`` - (described in the previous section) is now preferred. + ``Series.dt`` will raise a ``TypeError`` if you access with a non-datetimelike values - This old, deprecated behavior of ``match`` is still the default. As - demonstrated above, use the new behavior by setting ``as_indexer=True``. - In this mode, ``match`` is analogous to ``contains``, returning a boolean - Series. The new behavior will become the default behavior in a future - release. 
+Vectorized string methods +------------------------- -Methods like ``match``, ``contains``, ``startswith``, and ``endswith`` take - an extra ``na`` argument so missing values can be considered True or False: +Series is equipped with a set of string processing methods that make it easy to +operate on each element of the array. Perhaps most importantly, these methods +exclude missing/NA values automatically. These are accessed via the Series's +``str`` attribute and generally have names matching the equivalent (scalar) +built-in string methods. For example: -.. ipython:: python + .. ipython:: python - s4 = Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']) - s4.str.contains('A', na=False) + s = Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']) + s.str.lower() -.. csv-table:: - :header: "Method", "Description" - :widths: 20, 80 +Powerful pattern-matching methods are provided as well, but note that +pattern-matching generally uses `regular expressions +`__ by default (and in some cases +always uses them). - ``cat``,Concatenate strings - ``split``,Split strings on delimiter - ``get``,Index into each element (retrieve i-th element) - ``join``,Join strings in each element of the Series with passed separator - ``contains``,Return boolean array if each string contains pattern/regex - ``replace``,Replace occurrences of pattern/regex with some other string - ``repeat``,Duplicate values (``s.str.repeat(3)`` equivalent to ``x * 3``) - ``pad``,"Add whitespace to left, right, or both sides of strings" - ``center``,Equivalent to ``pad(side='both')`` - ``wrap``,Split long strings into lines with length less than a given width - ``slice``,Slice each string in the Series - ``slice_replace``,Replace slice in each string with passed value - ``count``,Count occurrences of pattern - ``startswith``,Equivalent to ``str.startswith(pat)`` for each element - ``endswith``,Equivalent to ``str.endswith(pat)`` for each element - ``findall``,Compute list of all occurrences of pattern/regex for each string - ``match``,"Call ``re.match`` on each element, returning matched groups as list" - ``extract``,"Call ``re.match`` on each element, as ``match`` does, but return matched groups as strings for convenience." - ``len``,Compute string lengths - ``strip``,Equivalent to ``str.strip`` - ``rstrip``,Equivalent to ``str.rstrip`` - ``lstrip``,Equivalent to ``str.lstrip`` - ``lower``,Equivalent to ``str.lower`` - ``upper``,Equivalent to ``str.upper`` - - -Getting indicator variables from separated strings -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -You can extract dummy variables from string columns. -For example if they are separated by a ``'|'``: - - .. ipython:: python - - s = pd.Series(['a', 'a|b', np.nan, 'a|c']) - s.str.get_dummies(sep='|') - -See also :func:`~pandas.get_dummies`. +Please see :ref:`Vectorized String Methods ` for a complete +description. .. _basics.sorting: @@ -1350,6 +1255,19 @@ argument: compatibility with NumPy methods which expect the ``ndarray.sort`` behavior. ``Series.order`` returns a copy of the sorted data. +Series has the ``searchsorted`` method, which works similar to +``np.ndarray.searchsorted``. + +.. ipython:: python + + ser = Series([1, 2, 3]) + ser.searchsorted([0, 3]) + ser.searchsorted([0, 4]) + ser.searchsorted([1, 3], side='right') + ser.searchsorted([1, 3], side='left') + ser = Series([3, 1, 2]) + ser.searchsorted([0, 3], sorter=np.argsort(ser)) + .. 
_basics.nsorted: smallest / largest values diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index c08351eb87a79..3ee660bb85691 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -10,73 +10,70 @@ import os np.random.seed(123456) from pandas import options + from pandas import * import pandas as pd np.set_printoptions(precision=4, suppress=True) options.display.mpl_style='default' options.display.max_rows=15 -*********** -Categorical -*********** +**************** +Categorical Data +**************** .. versionadded:: 0.15 .. note:: While there was in `pandas.Categorical` in earlier versions, the ability to use - `Categorical` data in `Series` and `DataFrame` is new. + categorical data in `Series` and `DataFrame` is new. -This is a introduction to pandas :class:`pandas.Categorical` type, including a short comparison -with R's `factor`. +This is a introduction to pandas categorical data type, including a short comparison +with R's ``factor``. `Categoricals` are a pandas data type, which correspond to categorical variables in statistics: a variable, which can take on only a limited, and usually fixed, -number of possible values (commonly called `levels`). Examples are gender, social class, +number of possible values (`categories`; `levels` in R). Examples are gender, social class, blood types, country affiliations, observation time or ratings via Likert scales. -In contrast to statistical categorical variables, a `Categorical` might have an order (e.g. +In contrast to statistical categorical variables, categorical data might have an order (e.g. 'strongly agree' vs 'agree' or 'first observation' vs. 'second observation'), but numerical operations (additions, divisions, ...) are not possible. -All values of the `Categorical` are either in `levels` or `np.nan`. Order is defined by -the order of the `levels`, not lexical order of the values. Internally, the data structure -consists of a levels array and an integer array of `codes` which point to the real value in the -levels array. +All values of categorical data are either in `categories` or `np.nan`. Order is defined by +the order of `categories`, not lexical order of the values. Internally, the data structure +consists of a `categories` array and an integer array of `codes` which point to the real value in +the `categories` array. -`Categoricals` are useful in the following cases: +The categorical data type is useful in the following cases: * A string variable consisting of only a few different values. Converting such a string - variable to a categorical variable will save some memory. + variable to a categorical variable will save some memory, see :ref:`here `. * The lexical order of a variable is not the same as the logical order ("one", "two", "three"). - By converting to a categorical and specifying an order on the levels, sorting and + By converting to a categorical and specifying an order on the categories, sorting and min/max will use the logical order instead of the lexical order. * As a signal to other python libraries that this column should be treated as a categorical - variable (e.g. to use suitable statistical methods or plot types) + variable (e.g. to use suitable statistical methods or plot types). -See also the :ref:`API docs on Categoricals`. +See also the :ref:`API docs on categoricals`. 
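As a quick illustration of the ordering use case listed above, here is a minimal sketch (assuming pandas 0.15 or later and made-up rating values; the same pattern is covered in detail in the sections below):

.. code-block:: python

    import pandas as pd

    # "low" < "medium" < "high" is the logical order we want,
    # which is not the lexical (alphabetical) order of the strings
    s = pd.Series(pd.Categorical(["medium", "high", "low", "high"],
                                 categories=["low", "medium", "high"]))
    s.sort()          # sorts by the category order, not alphabetically
    s.min(), s.max()  # ('low', 'high')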
Object Creation --------------- -Categorical `Series` or columns in a `DataFrame` can be crated in several ways: +Categorical `Series` or columns in a `DataFrame` can be created in several ways: -By passing a `Categorical` object to a `Series` or assigning it to a `DataFrame`: +By specifying ``dtype="category"`` when constructing a `Series`: .. ipython:: python - raw_cat = pd.Categorical(["a","b","c","a"]) - s = pd.Series(raw_cat) + s = Series(["a","b","c","a"], dtype="category") s - df = pd.DataFrame({"A":["a","b","c","a"]}) - df["B"] = raw_cat - df -By converting an existing `Series` or column to a ``category`` type: +By converting an existing `Series` or column to a ``category`` dtype: .. ipython:: python - df = pd.DataFrame({"A":["a","b","c","a"]}) + df = DataFrame({"A":["a","b","c","a"]}) df["B"] = df["A"].astype('category') df @@ -84,14 +81,29 @@ By using some special functions: .. ipython:: python - df = pd.DataFrame({'value': np.random.randint(0, 100, 20)}) + df = DataFrame({'value': np.random.randint(0, 100, 20)}) labels = [ "{0} - {1}".format(i, i + 9) for i in range(0, 100, 10) ] df['group'] = pd.cut(df.value, range(0, 105, 10), right=False, labels=labels) df.head(10) +See :ref:`documentation ` for :func:`~pandas.cut`. + +By passing a :class:`pandas.Categorical` object to a `Series` or assigning it to a `DataFrame`. +This is the only possibility to specify differently ordered categories (or no order at all) at +creation time and the only reason to use :class:`pandas.Categorical` directly: + +.. ipython:: python + + raw_cat = Categorical(["a","b","c","a"], categories=["b","c","d"], + ordered=False) + s = Series(raw_cat) + s + df = DataFrame({"A":["a","b","c","a"]}) + df["B"] = raw_cat + df -`Categoricals` have a specific ``category`` :ref:`dtype `: +Categorical data has a specific ``category`` :ref:`dtype `: .. ipython:: python @@ -99,283 +111,300 @@ By using some special functions: .. note:: - In contrast to R's `factor` function, a `Categorical` is not converting input values to - string and levels will end up the same data type as the original values. + In contrast to R's `factor` function, categorical data is not converting input values to + strings and categories will end up the same data type as the original values. .. note:: In contrast to R's `factor` function, there is currently no way to assign/change labels at - creation time. Use `levels` to change the levels after creation time. + creation time. Use `categories` to change the categories after creation time. To get back to the original Series or `numpy` array, use ``Series.astype(original_dtype)`` or ``np.asarray(categorical)``: .. ipython:: python - s = pd.Series(["a","b","c","a"]) + s = Series(["a","b","c","a"]) s s2 = s.astype('category') s2 s3 = s2.astype('string') s3 - np.asarray(s2.cat) + np.asarray(s2) -If you have already `codes` and `levels`, you can use the :func:`~pandas.Categorical.from_codes` +If you have already `codes` and `categories`, you can use the :func:`~pandas.Categorical.from_codes` constructor to save the factorize step during normal constructor mode: .. ipython:: python splitter = np.random.choice([0,1], 5, p=[0.5,0.5]) - pd.Categorical.from_codes(splitter, levels=["train", "test"]) + s = Series(Categorical.from_codes(splitter, categories=["train", "test"])) Description ----------- -Using ``.describe()`` on a ``Categorical(...)`` or a ``Series(Categorical(...))`` will show -different output. 
- - -As part of a `Dataframe` or as a `Series` a similar output as for a `Series` of type ``string`` is -shown. Calling ``Categorical.describe()`` will show the frequencies for each level, with NA for -unused levels. +Using ``.describe()`` on categorical data will produce similar output to a `Series` or +`DataFrame` of type ``string``. .. ipython:: python - cat = pd.Categorical(["a","c","c",np.nan], levels=["b","a","c",np.nan] ) - df = pd.DataFrame({"cat":cat, "s":["a","c","c",np.nan]}) + cat = Categorical(["a","c","c",np.nan], categories=["b","a","c",np.nan] ) + df = DataFrame({"cat":cat, "s":["a","c","c",np.nan]}) df.describe() - cat.describe() + df["cat"].describe() -Working with levels -------------------- +Working with categories +----------------------- -`Categoricals` have a `levels` property, which list their possible values. If you don't -manually specify levels, they are inferred from the passed in values. `Series` of type -``category`` expose the same interface via their `cat` property. +Categorical data has a `categories` and a `ordered` property, which list their possible values and +whether the ordering matters or not. These properties are exposed as ``s.cat.categories`` and +``s.cat.ordered``. If you don't manually specify categories and ordering, they are inferred from the +passed in values. .. ipython:: python - raw_cat = pd.Categorical(["a","b","c","a"]) - raw_cat.levels - raw_cat.ordered - # Series of type "category" also expose these interface via the .cat property: - s = pd.Series(raw_cat) - s.cat.levels + s = Series(["a","b","c","a"], dtype="category") + s.cat.categories s.cat.ordered -.. note:: - New `Categorical` are automatically ordered if the passed in values are sortable or a - `levels` argument is supplied. This is a difference to R's `factors`, which are unordered - unless explicitly told to be ordered (``ordered=TRUE``). - -It's also possible to pass in the levels in a specific order: +It's also possible to pass in the categories in a specific order: .. ipython:: python - raw_cat = pd.Categorical(["a","b","c","a"], levels=["c","b","a"]) - s = pd.Series(raw_cat) - s.cat.levels + s = Series(Categorical(["a","b","c","a"], categories=["c","b","a"])) + s.cat.categories s.cat.ordered .. note:: - - Passing in a `levels` argument implies ``ordered=True``. You can of course overwrite that by + New categorical data is automatically ordered if the passed in values are sortable or a + `categories` argument is supplied. This is a difference to R's `factors`, which are unordered + unless explicitly told to be ordered (``ordered=TRUE``). You can of course overwrite that by passing in an explicit ``ordered=False``. -Any value omitted in the levels argument will be replaced by `np.nan`: -.. ipython:: python - - raw_cat = pd.Categorical(["a","b","c","a"], levels=["a","b"]) - s = pd.Series(raw_cat) - s.cat.levels - s +Renaming categories +~~~~~~~~~~~~~~~~~~~ -Renaming levels is done by assigning new values to the ``Category.levels`` or -``Series.cat.levels`` property: +Renaming categories is done by assigning new values to the ``Series.cat.categories`` property or +by using the :func:`Categorical.rename_categories` method: .. ipython:: python - s = pd.Series(pd.Categorical(["a","b","c","a"])) - s - s.cat.levels = ["Group %s" % g for g in s.cat.levels] + s = Series(["a","b","c","a"], dtype="category") s - s.cat.levels = [1,2,3] + s.cat.categories = ["Group %s" % g for g in s.cat.categories] s + s.cat.rename_categories([1,2,3]) + +.. 
note::
+
+    In contrast to R's `factor`, categorical data can have categories of types other than string.

 .. note::
-    I contrast to R's `factor`, a `Categorical` can have levels of other types than string.
+    Be aware that assigning new categories is an inplace operation, while most other operations
+    under ``Series.cat`` by default return a new Series of dtype `category`.

-Levels must be unique or a `ValueError` is raised:
+Categories must be unique or a `ValueError` is raised:

 .. ipython:: python

     try:
-        s.cat.levels = [1,1,1]
+        s.cat.categories = [1,1,1]
     except ValueError as e:
         print("ValueError: " + str(e))

-Appending levels can be done by assigning a levels list longer than the current levels:
+Appending new categories
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+Appending categories can be done by using the :func:`Categorical.add_categories` method:

 .. ipython:: python

-    s.cat.levels = [1,2,3,4]
-    s.cat.levels
+    s = s.cat.add_categories([4])
+    s.cat.categories
     s

-.. note::
-    Adding levels in other positions can be done with ``.reorder_levels()``.
+Removing categories
+~~~~~~~~~~~~~~~~~~~

-Removing a level is also possible, but only the last level(s) can be removed by assigning a
-shorter list than current levels. Values which are omitted are replaced by ``np.nan``.
+Removing categories can be done by using the :func:`Categorical.remove_categories` method. Values
+which are removed are replaced by ``np.nan``:

 .. ipython:: python

-    s.cat.levels = [1,2]
+    s = s.cat.remove_categories([4])
     s

-.. note::
-
-    It's only possible to remove or add a level at the last position. If that's not where you want
-    to remove an old or add a new level, use ``Category.reorder_levels(new_order)`` or
-    ``Series.cat.reorder_levels(new_order)`` methods before or after.
+Removing unused categories
+~~~~~~~~~~~~~~~~~~~~~~~~~~

-Removing unused levels can also be done:
+Removing unused categories can also be done:

 .. ipython:: python

-    raw = pd.Categorical(["a","b","a"], levels=["a","b","c","d"])
-    c = pd.Series(raw)
-    raw
-    raw.remove_unused_levels()
-    raw
-    c.cat.remove_unused_levels()
-    c
+    s = Series(Categorical(["a","b","a"], categories=["a","b","c","d"]))
+    s
+    s.cat.remove_unused_categories()

-.. note::
+Setting categories
+~~~~~~~~~~~~~~~~~~

-    In contrast to R's `factor` function, passing a `Categorical` as the sole input to the
-    `Categorical` constructor will *not* remove unused levels but create a new `Categorical`
-    which is equal to the passed in one!
+If you want to remove and add new categories in one step (which has some speed advantage),
+or simply set the categories to a predefined scale, use :func:`Categorical.set_categories`.
+.. ipython:: python
+
+    s = Series(["one","two","four", "-"], dtype="category")
+    s
+    s = s.cat.set_categories(["one","two","three","four"])
+    s
+
+.. note::
+    Be aware that :func:`Categorical.set_categories` cannot know whether some category is omitted
+    intentionally or because it is misspelled or (under Python3) due to a type difference (e.g.,
+    numpy's S1 dtype and Python strings). This can result in surprising behaviour!

 Ordered or not...
 -----------------

-If a `Categoricals` is ordered (``cat.ordered == True``), then the order of the levels has a
+If categorical data is ordered (``s.cat.ordered == True``), then the order of the categories has a
 meaning and certain operations are possible. If the categorical is unordered, a `TypeError` is
 raised.

..
ipython:: python - s = pd.Series(pd.Categorical(["a","b","c","a"], ordered=False)) + s = Series(Categorical(["a","b","c","a"], ordered=False)) try: s.sort() except TypeError as e: print("TypeError: " + str(e)) - s = pd.Series(pd.Categorical(["a","b","c","a"], ordered=True)) + s = Series(["a","b","c","a"], dtype="category") # ordered per default! s.sort() s - print(s.min(), s.max()) + s.min(), s.max() -.. note:: - ``ordered=True`` is not necessary needed in the second case, as lists of strings are sortable - and so the resulting `Categorical` is ordered. - -Sorting will use the order defined by levels, not any lexical order present on the data type. +Sorting will use the order defined by categories, not any lexical order present on the data type. This is even true for strings and numeric data: .. ipython:: python - s = pd.Series(pd.Categorical([1,2,3,1])) - s.cat.levels = [2,3,1] + s = Series([1,2,3,1], dtype="category") + s.cat.categories = [2,3,1] s s.sort() s - print(s.min(), s.max()) + s.min(), s.max() -Reordering the levels is possible via the ``Categorical.reorder_levels(new_levels)`` or -``Series.cat.reorder_levels(new_levels)`` methods. All old levels must be included in the new -levels. +Reordering the categories is possible via the :func:`Categorical.reorder_categories` and +the :func:`Categorical.set_categories` methods. For :func:`Categorical.reorder_categories`, all +old categories must be included in the new categories and no new categories are allowed. .. ipython:: python - s2 = pd.Series(pd.Categorical([1,2,3,1])) - s2.cat.reorder_levels([2,3,1]) - s2 - s2.sort() - s2 - print(s2.min(), s2.max()) - + s = Series([1,2,3,1], dtype="category") + s = s.cat.reorder_categories([2,3,1]) + s + s.sort() + s + s.min(), s.max() .. note:: - Note the difference between assigning new level names and reordering the levels: the first - renames levels and therefore the individual values in the `Series`, but if the first + + Note the difference between assigning new categories and reordering the categories: the first + renames categories and therefore the individual values in the `Series`, but if the first position was sorted last, the renamed value will still be sorted last. Reordering means that the way values are sorted is different afterwards, but not that individual values in the `Series` are changed. -You can also add new levels with :func:`Categorical.reorder_levels`, as long as you include all -old levels: +.. note:: -.. ipython:: python + If the `Categorical` is not ordered, ``Series.min()`` and ``Series.max()`` will raise + `TypeError`. Numeric operations like ``+``, ``-``, ``*``, ``/`` and operations based on them + (e.g.``Series.median()``, which would need to compute the mean between two values if the length + of an array is even) do not work and raise a `TypeError`. - s3 = pd.Series(pd.Categorical(["a","b","d"])) - s3.cat.reorder_levels(["a","b","c","d"]) - s3 +Comparisons +----------- -Operations ----------- +Comparing `Categoricals` with other objects is possible in two cases: -The following operations are possible with categorical data: + * comparing a categorical Series to another categorical Series, when `categories` and `ordered` is + the same or + * comparing a categorical Series to a scalar. -Getting the minimum and maximum, if the categorical is ordered: +All other comparisons will raise a TypeError. .. 
ipython:: python - s = pd.Series(pd.Categorical(["a","b","c","a"], levels=["c","a","b","d"])) - print(s.min(), s.max()) + cat = Series(Categorical([1,2,3], categories=[3,2,1])) + cat_base = Series(Categorical([2,2,2], categories=[3,2,1])) + cat_base2 = Series(Categorical([2,2,2])) -.. note:: + cat + cat_base + cat_base2 + +Comparing to a categorical with the same categories and ordering or to a scalar works: - If the `Categorical` is not ordered, ``Categorical.min()`` and ``Categorical.max()`` and the - corresponding operations on `Series` will raise `TypeError`. +.. ipython:: python + + cat > cat_base + cat > 2 -The mode: +This doesn't work because the categories are not the same: .. ipython:: python - raw_cat = pd.Categorical(["a","b","c","c"], levels=["c","a","b","d"]) - s = pd.Series(raw_cat) - raw_cat.mode() - s.mode() + try: + cat > cat_base2 + except TypeError as e: + print("TypeError: " + str(e)) .. note:: - Numeric operations like ``+``, ``-``, ``*``, ``/`` and operations based on them (e.g. - ``.median()``, which would need to compute the mean between two values if the length of an - array is even) do not work and raise a `TypeError`. + Comparisons with `Series`, `np.array` or a `Categorical` with different categories or ordering + will raise an `TypeError` because custom categories ordering could be interpreted in two ways: + one with taking in account the ordering and one without. If you want to compare a categorical + series with such a type, you need to be explicit and convert the categorical data back to the + original values: + +.. ipython:: python + + base = np.array([1,2,3]) + + try: + cat > base + except TypeError as e: + print("TypeError: " + str(e)) -`Series` methods like `Series.value_counts()` will use all levels, even if some levels are not + np.asarray(cat) > base + +Operations +---------- + +Apart from ``Series.min()``, ``Series.max()`` and ``Series.mode()``, the following operations are +possible with categorical data: + +`Series` methods like `Series.value_counts()` will use all categories, even if some categories are not present in the data: .. ipython:: python - s = pd.Series(pd.Categorical(["a","b","c","c"], levels=["c","a","b","d"])) + s = Series(Categorical(["a","b","c","c"], categories=["c","a","b","d"])) s.value_counts() -Groupby will also show "unused" levels: +Groupby will also show "unused" categories: .. ipython:: python - cats = pd.Categorical(["a","b","b","b","c","c","c"], levels=["a","b","c","d"]) - df = pd.DataFrame({"cats":cats,"values":[1,2,2,2,3,4,5]}) + cats = Categorical(["a","b","b","b","c","c","c"], categories=["a","b","c","d"]) + df = DataFrame({"cats":cats,"values":[1,2,2,2,3,4,5]}) df.groupby("cats").mean() - cats2 = pd.Categorical(["a","a","b","b"], levels=["a","b","c"]) - df2 = pd.DataFrame({"cats":cats2,"B":["c","d","c","d"], "values":[1,2,3,4]}) + cats2 = Categorical(["a","a","b","b"], categories=["a","b","c"]) + df2 = DataFrame({"cats":cats2,"B":["c","d","c","d"], "values":[1,2,3,4]}) df2.groupby(["cats","B"]).mean() @@ -383,8 +412,8 @@ Pivot tables: .. 
ipython:: python - raw_cat = pd.Categorical(["a","a","b","b"], levels=["a","b","c"]) - df = pd.DataFrame({"A":raw_cat,"B":["c","d","c","d"], "values":[1,2,3,4]}) + raw_cat = Categorical(["a","a","b","b"], categories=["a","b","c"]) + df = DataFrame({"A":raw_cat,"B":["c","d","c","d"], "values":[1,2,3,4]}) pd.pivot_table(df, values='values', index=['A', 'B']) Data munging @@ -392,7 +421,7 @@ Data munging The optimized pandas data access methods ``.loc``, ``.iloc``, ``.ix`` ``.at``, and ``.iat``, work as normal, the only difference is the return type (for getting) and -that only values already in the levels can be assigned. +that only values already in `categories` can be assigned. Getting ~~~~~~~ @@ -402,17 +431,17 @@ the ``category`` dtype is preserved. .. ipython:: python - cats = pd.Categorical(["a","b","b","b","c","c","c"], levels=["a","b","c"]) - idx = pd.Index(["h","i","j","k","l","m","n",]) + idx = Index(["h","i","j","k","l","m","n",]) + cats = Series(["a","b","b","b","c","c","c"], dtype="category", index=idx) values= [1,2,2,2,3,4,5] - df = pd.DataFrame({"cats":cats,"values":values}, index=idx) + df = DataFrame({"cats":cats,"values":values}, index=idx) df.iloc[2:4,:] df.iloc[2:4,:].dtypes df.loc["h":"j","cats"] df.ix["h":"j",0:1] df[df["cats"] == "b"] -An example where the `Categorical` is not preserved is if you take one single row: the +An example where the category type is not preserved is if you take one single row: the resulting `Series` is of dtype ``object``: .. ipython:: python @@ -420,20 +449,20 @@ resulting `Series` is of dtype ``object``: # get the complete "h" row as a Series df.loc["h", :] -Returning a single item from a `Categorical` will also return the value, not a `Categorical` +Returning a single item from categorical data will also return the value, not a categorical of length "1". .. ipython:: python df.iat[0,0] - df["cats"].cat.levels = ["x","y","z"] + df["cats"].cat.categories = ["x","y","z"] df.at["h","cats"] # returns a string .. note:: This is a difference to R's `factor` function, where ``factor(c(1,2,3))[1]`` returns a single value `factor`. -To get a single value `Series` of type ``category`` pass in a single value list: +To get a single value `Series` of type ``category`` pass in a list with a single value: .. ipython:: python @@ -443,14 +472,14 @@ Setting ~~~~~~~ Setting values in a categorical column (or `Series`) works as long as the value is included in the -`levels`: +`categories`: .. ipython:: python - cats = pd.Categorical(["a","a","a","a","a","a","a"], levels=["a","b"]) - idx = pd.Index(["h","i","j","k","l","m","n"]) + idx = Index(["h","i","j","k","l","m","n"]) + cats = Categorical(["a","a","a","a","a","a","a"], categories=["a","b"]) values = [1,1,1,1,1,1,1] - df = pd.DataFrame({"cats":cats,"values":values}, index=idx) + df = DataFrame({"cats":cats,"values":values}, index=idx) df.iloc[2:4,:] = [["b",2],["b",2]] df @@ -459,14 +488,14 @@ Setting values in a categorical column (or `Series`) works as long as the value except ValueError as e: print("ValueError: " + str(e)) -Setting values by assigning a `Categorical` will also check that the `levels` match: +Setting values by assigning categorical data will also check that the `categories` match: .. 
ipython:: python - df.loc["j":"k","cats"] = pd.Categorical(["a","a"], levels=["a","b"]) + df.loc["j":"k","cats"] = Categorical(["a","a"], categories=["a","b"]) df try: - df.loc["j":"k","cats"] = pd.Categorical(["b","b"], levels=["a","b","c"]) + df.loc["j":"k","cats"] = Categorical(["b","b"], categories=["a","b","c"]) except ValueError as e: print("ValueError: " + str(e)) @@ -474,9 +503,9 @@ Assigning a `Categorical` to parts of a column of other types will use the value .. ipython:: python - df = pd.DataFrame({"a":[1,1,1,1,1], "b":["a","a","a","a","a"]}) - df.loc[1:2,"a"] = pd.Categorical(["b","b"], levels=["a","b"]) - df.loc[2:3,"b"] = pd.Categorical(["b","b"], levels=["a","b"]) + df = DataFrame({"a":[1,1,1,1,1], "b":["a","a","a","a","a"]}) + df.loc[1:2,"a"] = Categorical(["b","b"], categories=["a","b"]) + df.loc[2:3,"b"] = Categorical(["b","b"], categories=["a","b"]) df df.dtypes @@ -485,49 +514,53 @@ Merging ~~~~~~~ You can concat two `DataFrames` containing categorical data together, -but the levels of these `Categoricals` need to be the same: +but the categories of these categoricals need to be the same: .. ipython:: python - cat = pd.Categorical(["a","b"], levels=["a","b"]) - vals = [1,2] - df = pd.DataFrame({"cats":cat, "vals":vals}) - res = pd.concat([df,df]) - res - res.dtypes + cat = Series(["a","b"], dtype="category") + vals = [1,2] + df = DataFrame({"cats":cat, "vals":vals}) + res = pd.concat([df,df]) + res + res.dtypes + +In this case the categories are not the same and so an error is raised: - df_different = df.copy() - df_different["cats"].cat.levels = ["a","b","c"] +.. ipython:: python - try: - pd.concat([df,df]) - except ValueError as e: - print("ValueError: " + str(e)) + df_different = df.copy() + df_different["cats"].cat.categories = ["c","d"] + try: + pd.concat([df,df_different]) + except ValueError as e: + print("ValueError: " + str(e)) -The same applies to ``df.append(df)``. +The same applies to ``df.append(df_different)``. Getting Data In/Out ------------------- -Writing data (`Series`, `Frames`) to a HDF store that contains a ``category`` dtype will currently raise ``NotImplementedError``. +Writing data (`Series`, `Frames`) to a HDF store that contains a ``category`` dtype will currently +raise ``NotImplementedError``. Writing to a CSV file will convert the data, effectively removing any information about the -`Categorical` (levels and ordering). So if you read back the CSV file you have to convert the -relevant columns back to `category` and assign the right levels and level ordering. +categorical (categories and ordering). So if you read back the CSV file you have to convert the +relevant columns back to `category` and assign the right categories and categories ordering. .. ipython:: python - :suppress: + :suppress: from pandas.compat import StringIO .. 
ipython:: python - s = pd.Series(pd.Categorical(['a', 'b', 'b', 'a', 'a', 'd'])) - # rename the levels - s.cat.levels = ["very good", "good", "bad"] - # reorder the levels and add missing levels - s.cat.reorder_levels(["very bad", "bad", "medium", "good", "very good"]) - df = pd.DataFrame({"cats":s, "vals":[1,2,3,4,5,6]}) + s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'd'])) + # rename the categories + s.cat.categories = ["very good", "good", "bad"] + # reorder the categories and add missing categories + s = s.cat.set_categories(["very bad", "bad", "medium", "good", "very good"]) + df = DataFrame({"cats":s, "vals":[1,2,3,4,5,6]}) csv = StringIO() df.to_csv(csv) df2 = pd.read_csv(StringIO(csv.getvalue())) @@ -535,7 +568,8 @@ relevant columns back to `category` and assign the right levels and level orderi df2["cats"] # Redo the category df2["cats"] = df2["cats"].astype("category") - df2["cats"].cat.reorder_levels(["very bad", "bad", "medium", "good", "very good"]) + df2["cats"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"], + inplace=True) df2.dtypes df2["cats"] @@ -547,185 +581,227 @@ pandas primarily uses the value `np.nan` to represent missing data. It is by default not included in computations. See the :ref:`Missing Data section ` -There are two ways a `np.nan` can be represented in `Categorical`: either the value is not -available or `np.nan` is a valid level. +There are two ways a `np.nan` can be represented in categorical data: either the value is not +available ("missing value") or `np.nan` is a valid category. .. ipython:: python - s = pd.Series(pd.Categorical(["a","b",np.nan,"a"])) + s = Series(["a","b",np.nan,"a"], dtype="category") + # only two categories s - # only two levels - s.cat.levels - s2 = pd.Series(pd.Categorical(["a","b","c","a"])) - s2.cat.levels = [1,2,np.nan] + s2 = Series(["a","b","c","a"], dtype="category") + s2.cat.categories = [1,2,np.nan] + # three categories, np.nan included s2 - # three levels, np.nan included - # Note: as int arrays can't hold NaN the levels were converted to float - s2.cat.levels + +.. note:: + As integer `Series` can't include NaN, the categories were converted to `object`. + +.. note:: + Missing value methods like ``isnull`` and ``fillna`` will take both missing values as well as + `np.nan` categories into account: + +.. ipython:: python + + c = Series(["a","b",np.nan], dtype="category") + c.cat.set_categories(["a","b",np.nan], inplace=True) + # will be inserted as a NA category: + c[0] = np.nan + s = Series(c) + s + pd.isnull(s) + s.fillna("a") + +Differences to R's `factor` +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The following differences to R's factor functions can be observed: + +* R's `levels` are named `categories` +* R's `levels` are always of type string, while `categories` in pandas can be of any dtype. +* New categorical data is automatically ordered if the passed in values are sortable or a + `categories` argument is supplied. This is a difference to R's `factors`, which are unordered + unless explicitly told to be ordered (``ordered=TRUE``). +* It's not possible to specify labels at creation time. Use ``s.cat.rename_categories(new_labels)`` + afterwards. +* In contrast to R's `factor` function, using categorical data as the sole input to create a + new categorical series will *not* remove unused categories but create a new categorical series + which is equal to the passed in one! Gotchas ------- -`Categorical` is not a `numpy` array -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. 
_categorical.rfactor:

-Currently, `Categorical` and the corresponding ``category`` `Series` is implemented as a python
-object and not as a low level `numpy` array dtype. This leads to some problems.
+Memory Usage
+~~~~~~~~~~~~

-`numpy` itself doesn't know about the new `dtype`:
+.. _categorical.memory:
+
+The memory usage of a ``Categorical`` is proportional to the number of categories times the length of the data. In contrast,
+an ``object`` dtype is a constant times the length of the data.

 .. ipython:: python

-    try:
-        np.dtype("category")
-    except TypeError as e:
-        print("TypeError: " + str(e))
+    s = Series(['foo','bar']*1000)

-    dtype = pd.Categorical(["a"]).dtype
-    try:
-        np.dtype(dtype)
-    except TypeError as e:
-        print("TypeError: " + str(e))
+    # object dtype
+    s.nbytes

-    # dtype comparisons work:
-    dtype == np.str_
-    np.str_ == dtype
+    # category dtype
+    s.astype('category').nbytes

-Using `numpy` functions on a `Series` of type ``category`` should not work as `Categoricals`
-are not numeric data (even in the case that ``.levels`` is numeric).
+.. note::

-.. ipython:: python
+    If the number of categories approaches the length of the data, the ``Categorical`` will use nearly the same (or even more) memory than an
+    equivalent ``object`` dtype representation.

-    s = pd.Series(pd.Categorical([1,2,3,4]))
-    try:
-        np.sum(s)
-        #same with np.log(s),..
-    except TypeError as e:
-        print("TypeError: " + str(e))
+    .. ipython:: python

-.. note::
-    If such a function works, please file a bug at https://github.com/pydata/pandas!
+        s = Series(['foo%04d' % i for i in range(2000)])

+        # object dtype
+        s.nbytes

-Side effects
-~~~~~~~~~~~~
+        # category dtype
+        s.astype('category').nbytes

-Constructing a `Series` from a `Categorical` will not copy the input `Categorical`. This
-means that changes to the `Series` will in most cases change the original `Categorical`:

-.. ipython:: python
+Old style constructor usage
+~~~~~~~~~~~~~~~~~~~~~~~~~~~

-    cat = pd.Categorical([1,2,3,10], levels=[1,2,3,4,10])
-    s = pd.Series(cat, name="cat")
-    cat
-    s.iloc[0:2] = 10
-    cat
-    df = pd.DataFrame(s)
-    df["cat"].cat.levels = [1,2,3,4,5]
-    cat
+In versions of pandas earlier than 0.15, a `Categorical` could be constructed by passing in precomputed
+`codes` (then called `labels`) instead of values with categories. The `codes` were interpreted as
+pointers to the categories with `-1` as `NaN`. This type of constructor usage is replaced by
+the special constructor :func:`Categorical.from_codes`.

-Use ``copy=True`` to prevent such a behaviour:
+Unfortunately, in some special cases, code which assumes the old style constructor usage
+will still run with the current pandas version, resulting in subtle bugs:

-.. ipython:: python
+.. code-block:: python

-    cat = pd.Categorical([1,2,3,10], levels=[1,2,3,4,10])
-    s = pd.Series(cat, name="cat", copy=True)
-    cat
-    s.iloc[0:2] = 10
-    cat
+    >>> cat = Categorical([1,2], [1,2,3])
+    >>> # old version
+    >>> cat.get_values()
+    array([2, 3], dtype=int64)
+    >>> # new version
+    >>> cat.get_values()
+    array([1, 2], dtype=int64)

-.. note::
-    This also happens in some cases when you supply a `numpy` array instea dof a `Categorical`:
-    using an int array (e.g. ``np.array([1,2,3,4])``) will exhibit the same behaviour, but using
-    a string array (e.g. ``np.array(["a","b","c","a"])``) will not.
+.. warning::
+    If you used `Categoricals` with older versions of pandas, please audit your code before
+    upgrading and change your code to use the :func:`~pandas.Categorical.from_codes`
+    constructor.
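To make the recommended migration concrete, here is a minimal, hedged sketch of constructing categorical data from precomputed codes with ``Categorical.from_codes`` (assuming pandas 0.15 or later; the example codes and categories are made up):

.. code-block:: python

    import pandas as pd

    # codes point into the categories list; the old constructor treated
    # such integers as pointers, the plain constructor now treats them as values
    codes = [0, 1, 1, 0]
    cat = pd.Categorical.from_codes(codes, categories=["a", "b"])
    # cat: [a, b, b, a] with categories [a, b]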
+`Categorical` is not a `numpy` array +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Danger of confusion -~~~~~~~~~~~~~~~~~~~ +Currently, categorical data and the underlying `Categorical` is implemented as a python +object and not as a low-level `numpy` array dtype. This leads to some problems. -Both `Series` and `Categorical` have a method ``.reorder_levels()`` but for different things. For -Series of type ``category`` this means that there is some danger to confuse both methods. +`numpy` itself doesn't know about the new `dtype`: .. ipython:: python - s = pd.Series(pd.Categorical([1,2,3,4])) - print(s.cat.levels) - # wrong and raises an error: try: - s.reorder_levels([4,3,2,1]) - except Exception as e: - print("Exception: " + str(e)) - # right - s.cat.reorder_levels([4,3,2,1]) - print(s.cat.levels) - -See also the API documentation for :func:`pandas.Series.reorder_levels` and -:func:`pandas.Categorical.reorder_levels` + np.dtype("category") + except TypeError as e: + print("TypeError: " + str(e)) -Old style constructor usage -~~~~~~~~~~~~~~~~~~~~~~~~~~~ + dtype = Categorical(["a"]).dtype + try: + np.dtype(dtype) + except TypeError as e: + print("TypeError: " + str(e)) -I earlier versions, a `Categorical` could be constructed by passing in precomputed `codes` -(called then `labels`) instead of values with levels. The `codes` are interpreted as pointers -to the levels with `-1` as `NaN`. This usage is now deprecated and not available unless -``compat=True`` is passed to the constructor of `Categorical`. +Dtype comparisons work: .. ipython:: python - :okwarning: - # This raises a FutureWarning: - cat = pd.Categorical([1,2], levels=[1,2,3], compat=True) - cat.get_values() + dtype == np.str_ + np.str_ == dtype -In the default case (``compat=False``) the first argument is interpreted as values. +Using `numpy` functions on a `Series` of type ``category`` should not work as `Categoricals` +are not numeric data (even in the case that ``.categories`` is numeric). .. ipython:: python - cat = pd.Categorical([1,2], levels=[1,2,3], compat=False) - cat.get_values() + s = Series(Categorical([1,2,3,4])) + try: + np.sum(s) + #same with np.log(s),.. + except TypeError as e: + print("TypeError: " + str(e)) -.. warning:: - Using Categorical with precomputed codes and levels is deprecated and a `FutureWarning` - is raised. Please change your code to use the :func:`~pandas.Categorical.from_codes` - constructor instead of adding ``compat=False``. +.. note:: + If such a function works, please file a bug at https://github.com/pydata/pandas! + +dtype in apply +~~~~~~~~~~~~~~ + +Pandas currently does not preserve the dtype in apply functions: If you apply along rows you get +a `Series` of ``object`` `dtype` (same as getting a row -> getting one element will return a +basic type) and applying along columns will also convert to object. + +.. 
ipython:: python + + df = DataFrame({"a":[1,2,3,4], + "b":["a","b","c","d"], + "cats":Categorical([1,2,3,2])}) + df.apply(lambda row: type(row["cats"]), axis=1) + df.apply(lambda col: col.dtype, axis=0) -No categorical index +No Categorical Index ~~~~~~~~~~~~~~~~~~~~ -There is currently no index of type ``category``, so setting the index to a `Categorical` will -convert the `Categorical` to a normal `numpy` array first and therefore remove any custom -ordering of the levels: +There is currently no index of type ``category``, so setting the index to categorical column will +convert the categorical data to a "normal" dtype first and therefore remove any custom +ordering of the categories: .. ipython:: python - cats = pd.Categorical([1,2,3,4], levels=[4,2,3,1]) + cats = Categorical([1,2,3,4], categories=[4,2,3,1]) strings = ["a","b","c","d"] values = [4,2,3,1] - df = pd.DataFrame({"strings":strings, "values":values}, index=cats) + df = DataFrame({"strings":strings, "values":values}, index=cats) df.index - # This should sort by levels but does not as there is no CategoricalIndex! + # This should sort by categories but does not as there is no CategoricalIndex! df.sort_index() .. note:: This could change if a `CategoricalIndex` is implemented (see https://github.com/pydata/pandas/issues/7629) -dtype in apply -~~~~~~~~~~~~~~ -Pandas currently does not preserve the dtype in apply functions: If you apply along rows you get -a `Series` of ``object`` `dtype` (same as getting a row -> getting one element will return a -basic type) and applying along columns will also convert to object. +Side Effects +~~~~~~~~~~~~ + +Constructing a `Series` from a `Categorical` will not copy the input `Categorical`. This +means that changes to the `Series` will in most cases change the original `Categorical`: .. ipython:: python - df = pd.DataFrame({"a":[1,2,3,4], "b":["a","b","c","d"], "cats":pd.Categorical([1,2,3,2])}) - df.apply(lambda row: type(row["cats"]), axis=1) - df.apply(lambda col: col.dtype, axis=0) + cat = Categorical([1,2,3,10], categories=[1,2,3,4,10]) + s = Series(cat, name="cat") + cat + s.iloc[0:2] = 10 + cat + df = DataFrame(s) + df["cat"].cat.categories = [1,2,3,4,5] + cat +Use ``copy=True`` to prevent such a behaviour or simply don't reuse `Categoricals`: -Future compatibility -~~~~~~~~~~~~~~~~~~~~ +.. ipython:: python + + cat = Categorical([1,2,3,10], categories=[1,2,3,4,10]) + s = Series(cat, name="cat", copy=True) + cat + s.iloc[0:2] = 10 + cat + +.. note:: + This also happens in some cases when you supply a `numpy` array instead of a `Categorical`: + using an int array (e.g. ``np.array([1,2,3,4])``) will exhibit the same behaviour, while using + a string array (e.g. ``np.array(["a","b","c","a"])``) will not. -As `Categorical` is not a native `numpy` dtype, the implementation details of -`Series.cat` can change if such a `numpy` dtype is implemented. diff --git a/doc/source/comparison_with_r.rst b/doc/source/comparison_with_r.rst index 84bba77e0dfa3..8462bacef47d2 100644 --- a/doc/source/comparison_with_r.rst +++ b/doc/source/comparison_with_r.rst @@ -6,7 +6,7 @@ import pandas as pd import numpy as np - options.display.max_rows=15 + pd.options.display.max_rows=15 Comparison with R / R libraries ******************************* @@ -51,7 +51,7 @@ Selecting multiple columns by name in ``pandas`` is straightforward .. 
ipython:: python - df = DataFrame(np.random.randn(10, 3), columns=list('abc')) + df = pd.DataFrame(np.random.randn(10, 3), columns=list('abc')) df[['a', 'c']] df.loc[:, ['a', 'c']] @@ -63,7 +63,7 @@ with a combination of the ``iloc`` indexer attribute and ``numpy.r_``. named = list('abcdefg') n = 30 columns = named + np.arange(len(named), n).tolist() - df = DataFrame(np.random.randn(n, n), columns=columns) + df = pd.DataFrame(np.random.randn(n, n), columns=columns) df.iloc[:, np.r_[:10, 24:30]] @@ -88,8 +88,7 @@ function. .. ipython:: python - from pandas import DataFrame - df = DataFrame({ + df = pd.DataFrame({ 'v1': [1,3,5,7,8,3,5,np.nan,4,5,7,9], 'v2': [11,33,55,77,88,33,55,np.nan,44,55,77,99], 'by1': ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12], @@ -166,7 +165,7 @@ In ``pandas`` we may use :meth:`~pandas.pivot_table` method to handle this: import random import string - baseball = DataFrame({ + baseball = pd.DataFrame({ 'team': ["team %d" % (x+1) for x in range(5)]*5, 'player': random.sample(list(string.ascii_lowercase),25), 'batting avg': np.random.uniform(.200, .400, 25) @@ -197,7 +196,7 @@ index/slice as well as standard boolean indexing: .. ipython:: python - df = DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)}) + df = pd.DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)}) df.query('a <= b') df[df.a <= df.b] df.loc[df.a <= df.b] @@ -225,7 +224,7 @@ In ``pandas`` the equivalent expression, using the .. ipython:: python - df = DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)}) + df = pd.DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)}) df.eval('a + b') df.a + df.b # same as the previous expression @@ -283,7 +282,7 @@ In ``pandas`` the equivalent expression, using the .. ipython:: python - df = DataFrame({ + df = pd.DataFrame({ 'x': np.random.uniform(1., 168., 120), 'y': np.random.uniform(7., 334., 120), 'z': np.random.uniform(1.7, 20.7, 120), @@ -317,7 +316,7 @@ In Python, since ``a`` is a list, you can simply use list comprehension. .. ipython:: python a = np.array(list(range(1,24))+[np.NAN]).reshape(2,3,4) - DataFrame([tuple(list(x)+[val]) for x, val in np.ndenumerate(a)]) + pd.DataFrame([tuple(list(x)+[val]) for x, val in np.ndenumerate(a)]) |meltlist|_ ~~~~~~~~~~~~ @@ -336,7 +335,7 @@ In Python, this list would be a list of tuples, so .. ipython:: python a = list(enumerate(list(range(1,5))+[np.NAN])) - DataFrame(a) + pd.DataFrame(a) For more details and examples see :ref:`the Into to Data Structures documentation `. @@ -361,7 +360,7 @@ In Python, the :meth:`~pandas.melt` method is the R equivalent: .. ipython:: python - cheese = DataFrame({'first' : ['John', 'Mary'], + cheese = pd.DataFrame({'first' : ['John', 'Mary'], 'last' : ['Doe', 'Bo'], 'height' : [5.5, 6.0], 'weight' : [130, 150]}) @@ -394,7 +393,7 @@ In Python the best way is to make use of :meth:`~pandas.pivot_table`: .. ipython:: python - df = DataFrame({ + df = pd.DataFrame({ 'x': np.random.uniform(1., 168., 12), 'y': np.random.uniform(7., 334., 12), 'z': np.random.uniform(1.7, 20.7, 12), @@ -426,7 +425,7 @@ using :meth:`~pandas.pivot_table`: .. ipython:: python - df = DataFrame({ + df = pd.DataFrame({ 'Animal': ['Animal1', 'Animal2', 'Animal3', 'Animal2', 'Animal1', 'Animal2', 'Animal3'], 'FeedType': ['A', 'B', 'A', 'A', 'B', 'B', 'A'], @@ -444,6 +443,30 @@ The second approach is to use the :meth:`~pandas.DataFrame.groupby` method: For more details and examples see :ref:`the reshaping documentation ` or :ref:`the groupby documentation`. 
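The groupby-based second approach referenced in the hunk header above could look roughly like the following sketch; the ``Amount`` column and its values are assumed here for illustration, since only the changed lines of that example are visible in this diff:

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({
        'Animal': ['Animal1', 'Animal2', 'Animal3', 'Animal2',
                   'Animal1', 'Animal2', 'Animal3'],
        'FeedType': ['A', 'B', 'A', 'A', 'B', 'B', 'A'],
        'Amount': [10, 7, 4, 2, 5, 6, 2],   # hypothetical amounts
    })

    # aggregate Amount per Animal/FeedType pair, comparable to R's cast(..., sum)
    df.groupby(['Animal', 'FeedType'])['Amount'].sum()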
+|factor|_ +~~~~~~~~~ + +.. versionadded:: 0.15 + +pandas has a data type for categorical data. + +.. code-block:: r + + cut(c(1,2,3,4,5,6), 3) + factor(c(1,2,3,2,2,3)) + +In pandas this is accomplished with ``pd.cut`` and ``astype("category")``: + +.. ipython:: python + + pd.cut(pd.Series([1,2,3,4,5,6]), 3) + pd.Series([1,2,3,2,2,3]).astype("category") + +For more details and examples see :ref:`categorical introduction ` and the +:ref:`API documentation `. There is also a documentation regarding the +:ref:`differences to R's factor `. + + .. |c| replace:: ``c`` .. _c: http://stat.ethz.ch/R-manual/R-patched/library/base/html/c.html @@ -477,3 +500,5 @@ For more details and examples see :ref:`the reshaping documentation .. |cast| replace:: ``cast`` .. cast: http://www.inside-r.org/packages/cran/reshape2/docs/cast +.. |factor| replace:: ``factor`` +.. _factor: https://stat.ethz.ch/R-manual/R-devel/library/base/html/factor.html \ No newline at end of file diff --git a/doc/source/computation.rst b/doc/source/computation.rst index d5dcacf53ec23..759675c51b960 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -310,7 +310,7 @@ keyword. The list of recognized types are: rolling_window(ser, 5, 'triang') -Note that the ``boxcar`` window is equivalent to ``rolling_mean``: +Note that the ``boxcar`` window is equivalent to ``rolling_mean``. .. ipython:: python @@ -336,6 +336,19 @@ This keyword is available in other rolling functions as well. rolling_mean(ser, 5, center=True) +.. _stats.moments.normalization: + +.. note:: + + In rolling sum mode (``mean=False``) there is no normalization done to the + weights. Passing custom weights of ``[1, 1, 1]`` will yield a different + result than passing weights of ``[2, 2, 2]``, for example. When passing a + ``win_type`` instead of explicitly specifying the weights, the weights are + already normalized so that the largest weight is 1. + + In contrast, the nature of the rolling mean calculation (``mean=True``)is + such that the weights are normalized with respect to each other. Weights + of ``[1, 1, 1]`` and ``[2, 2, 2]`` yield the same result. .. _stats.moments.binary: @@ -413,6 +426,8 @@ columns using ``ix`` indexing: @savefig rolling_corr_pairwise_ex.png correls.ix[:, 'A', 'C'].plot() +.. _stats.moments.expanding: + Expanding window moment functions --------------------------------- A common alternative to rolling statistics is to use an *expanding* window, @@ -485,60 +500,79 @@ relative impact of an individual data point. As an example, here is the @savefig expanding_mean_frame.png expanding_mean(ts).plot(style='k') +.. _stats.moments.exponentially_weighted: + Exponentially weighted moment functions --------------------------------------- -A related set of functions are exponentially weighted versions of many of the -above statistics. A number of EW (exponentially weighted) functions are -provided using the blending method. For example, where :math:`y_t` is the -result and :math:`x_t` the input, we compute an exponentially weighted moving -average as +A related set of functions are exponentially weighted versions of several of +the above statistics. A number of expanding EW (exponentially weighted) +functions are provided: + +.. csv-table:: + :header: "Function", "Description" + :widths: 20, 80 + + ``ewma``, EW moving average + ``ewmvar``, EW moving variance + ``ewmstd``, EW moving standard deviation + ``ewmcorr``, EW moving correlation + ``ewmcov``, EW moving covariance + +In general, a weighted moving average is calculated as .. 
math::

-    y_t = (1 - \alpha) y_{t-1} + \alpha x_t
+    y_t = \frac{\sum_{i=0}^t w_i x_{t-i}}{\sum_{i=0}^t w_i},

-One must have :math:`0 < \alpha \leq 1`, but rather than pass :math:`\alpha`
-directly, it's easier to think about either the **span**, **center of mass
-(com)** or **halflife** of an EW moment:
+where :math:`x_t` is the input and :math:`y_t` is the result.
+
+The EW functions support two variants of exponential weights.
+The default, ``adjust=True``, uses the weights :math:`w_i = (1 - \alpha)^i`.
+When ``adjust=False`` is specified, moving averages are calculated as

 .. math::

-    \alpha =
-     \begin{cases}
-         \frac{2}{s + 1}, s = \text{span}\\
-         \frac{1}{1 + c}, c = \text{center of mass}\\
-         1 - \exp^{\frac{\log 0.5}{h}}, h = \text{half life}
+    y_0 &= x_0 \\
+    y_t &= (1 - \alpha) y_{t-1} + \alpha x_t,
+
+which is equivalent to using weights
+
+.. math::
+
+    w_i = \begin{cases}
+        \alpha (1 - \alpha)^i & \text{if } i < t \\
+        (1 - \alpha)^i        & \text{if } i = t.
     \end{cases}

 .. note::

-   the equation above is sometimes written in the form
+   These equations are sometimes written in terms of :math:`\alpha' = 1 - \alpha`, e.g.
+
+   .. math::
+
+      y_t = \alpha' y_{t-1} + (1 - \alpha') x_t.

-   .. math::
+One must have :math:`0 < \alpha \leq 1`, but rather than pass :math:`\alpha`
+directly, it's easier to think about either the **span**, **center of mass
+(com)** or **halflife** of an EW moment:

-      y_t = \alpha' y_{t-1} + (1 - \alpha') x_t
+.. math::

-   where :math:`\alpha' = 1 - \alpha`.
+    \alpha =
+     \begin{cases}
+         \frac{2}{s + 1},               & s = \text{span}\\
+         \frac{1}{1 + c},               & c = \text{center of mass}\\
+         1 - \exp^{\frac{\log 0.5}{h}}, & h = \text{half life}
+     \end{cases}

-You can pass one of the three to these functions but not more. **Span**
+One must specify precisely one of the three to the EW functions. **Span**
 corresponds to what is commonly called a "20-day EW moving average" for
 example. **Center of mass** has a more physical interpretation. For example,
 **span** = 20 corresponds to **com** = 9.5. **Halflife** is the period of
-time for the exponential weight to reduce to one half. Here is the list of
-functions available:
+time for the exponential weight to reduce to one half.

-.. csv-table::
-    :header: "Function", "Description"
-    :widths: 20, 80
-
-    ``ewma``, EW moving average
-    ``ewmvar``, EW moving variance
-    ``ewmstd``, EW moving standard deviation
-    ``ewmcorr``, EW moving correlation
-    ``ewmcov``, EW moving covariance
-
-Here are an example for a univariate time series:
+Here is an example for a univariate time series:

 .. ipython:: python
@@ -548,8 +582,45 @@ Here are an example for a univariate time series:
    @savefig ewma_ex.png
    ewma(ts, span=20).plot(style='k')

-.. note::
+All the EW functions have a ``min_periods`` argument, which has the same
+meaning it does for all the ``expanding_`` and ``rolling_`` functions:
+no output values will be set until at least ``min_periods`` non-null values
+are encountered in the (expanding) window.
+(This is a change from versions prior to 0.15.0, in which the ``min_periods``
+argument affected only the ``min_periods`` consecutive entries starting at the
+first non-null value.)
+
+All the EW functions also have an ``ignore_na`` argument, which determines how
+intermediate null values affect the calculation of the weights.
+When ``ignore_na=False`` (the default), weights are calculated based on absolute
+positions, so that intermediate null values affect the result.
+When ``ignore_na=True`` (which reproduces the behavior in versions prior to 0.15.0), +weights are calculated by ignoring intermediate null values. +For example, assuming ``adjust=True``, if ``ignore_na=False``, the weighted +average of ``3, NaN, 5`` would be calculated as + +.. math:: + + \frac{(1-\alpha)^2 \cdot 3 + 1 \cdot 5}{(1-\alpha)^2 + 1} + +Whereas if ``ignore_na=True``, the weighted average would be calculated as + +.. math:: + + \frac{(1-\alpha) \cdot 3 + 1 \cdot 5}{(1-\alpha) + 1}. + +The ``ewmvar``, ``ewmstd``, and ``ewmcov`` functions have a ``bias`` argument, +specifying whether the result should contain biased or unbiased statistics. +For example, if ``bias=True``, ``ewmvar(x)`` is calculated as +``ewmvar(x) = ewma(x**2) - ewma(x)**2``; +whereas if ``bias=False`` (the default), the biased variance statistics +are scaled by debiasing factors + +.. math:: + + \frac{\left(\sum_{i=0}^t w_i\right)^2}{\left(\sum_{i=0}^t w_i\right)^2 - \sum_{i=0}^t w_i^2}. - The EW functions perform a standard adjustment to the initial observations - whereby if there are fewer observations than called for in the span, those - observations are reweighted accordingly. +(For :math:`w_i = 1`, this reduces to the usual :math:`N / (N - 1)` factor, +with :math:`N = t + 1`.) +See http://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Weighted_sample_variance +for further details. diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index 2c1f96d91c45f..edff461d7989d 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -5,28 +5,42 @@ .. ipython:: python :suppress: + import pandas as pd import numpy as np + import random import os + import itertools + import functools + import datetime + np.random.seed(123456) - from pandas import * - options.display.max_rows=15 - options.display.mpl_style='default' - import pandas as pd - randn = np.random.randn - randint = np.random.randint - np.set_printoptions(precision=4, suppress=True) + + pd.options.display.max_rows=15 + pd.options.display.mpl_style='default' + np.set_printoptions(precision=4, suppress=True) + + ******** Cookbook ******** This is a repository for *short and sweet* examples and links for useful pandas recipes. -We encourage users to add to this documentation. +We encourage users to add to this documentation. + +Adding interesting links and/or inline examples to this section is a great *First Pull Request*. -This is a great *First Pull Request* (to add interesting links and/or put short code inline -for existing links) +Simplified, condensed, new-user friendly, in-line examples have been inserted where possible to +augment the Stack-Overflow and GitHub links. Many of the links contain expanded information, +above what the in-line examples offer. +Pandas (pd) and Numpy (np) are the only two abbreviated imported modules. The rest are kept +explicitly imported for newer users. + +These examples are written for python 3.4. Minor tweaks might be necessary for earlier python +versions. + Idioms ------ @@ -34,92 +48,370 @@ Idioms These are some neat pandas ``idioms`` -`How to do if-then-else? +`if-then/if-then-else on one column, and assignment to another one or more columns: `__ -`How to do if-then-else #2 +.. ipython:: python + + df = pd.DataFrame( + {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df + +if-then... +********** + +An if-then on one column + +.. ipython:: python + + df.ix[df.AAA >= 5,'BBB'] = -1; df + +An if-then with assignment to 2 columns: + +.. 
ipython:: python + + df.ix[df.AAA >= 5,['BBB','CCC']] = 555; df + +Add another line with different logic, to do the -else + +.. ipython:: python + + df.ix[df.AAA < 5,['BBB','CCC']] = 2000; df + +Or use pandas where after you've set up a mask + +.. ipython:: python + + df_mask = pd.DataFrame({'AAA' : [True] * 4, 'BBB' : [False] * 4,'CCC' : [True,False] * 2}) + df.where(df_mask,-1000) + +`if-then-else using numpy's where() `__ -`How to split a frame with a boolean criterion? +.. ipython:: python + + df = pd.DataFrame( + {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df + + df['logic'] = np.where(df['AAA'] > 5,'high','low'); df + +Splitting +********* + +`Split a frame with a boolean criterion `__ -`How to select from a frame with complex criteria? +.. ipython:: python + + df = pd.DataFrame( + {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df + + dflow = df[df.AAA <= 5] + dfhigh = df[df.AAA > 5] + + dflow; dfhigh + +Building Criteria +***************** + +`Select with multi-column criteria `__ -`Select rows closest to a user-defined number +.. ipython:: python + + df = pd.DataFrame( + {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df + +...and (without assignment returns a Series) + +.. ipython:: python + + newseries = df.loc[(df['BBB'] < 25) & (df['CCC'] >= -40), 'AAA']; newseries + +...or (without assignment returns a Series) + +.. ipython:: python + + newseries = df.loc[(df['BBB'] > 25) | (df['CCC'] >= -40), 'AAA']; newseries; + +...or (with assignment modifies the DataFrame.) + +.. ipython:: python + + df.loc[(df['BBB'] > 25) | (df['CCC'] >= 75), 'AAA'] = 0.1; df + +`Select rows with data closest to certain value using argsort `__ -`How to reduce a sequence (e.g. of Series) using a binary operator +.. ipython:: python + + df = pd.DataFrame( + {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df + + aValue = 43.0 + df.ix[(df.CCC-aValue).abs().argsort()] + +`Dynamically reduce a list of criteria using a binary operators `__ +.. ipython:: python + + df = pd.DataFrame( + {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df + + Crit1 = df.AAA <= 5.5 + Crit2 = df.BBB == 10.0 + Crit3 = df.CCC > -40.0 + +One could hard code: + +.. ipython:: python + + AllCrit = Crit1 & Crit2 & Crit3 + +...Or it can be done with a list of dynamically built criteria + +.. ipython:: python + + CritList = [Crit1,Crit2,Crit3] + AllCrit = functools.reduce(lambda x,y: x & y, CritList) + + df[AllCrit] .. _cookbook.selection: Selection --------- +DataFrames +********** + The :ref:`indexing ` docs. -`Indexing using both row labels and conditionals +`Using both row labels and value conditionals `__ +.. ipython:: python + + df = pd.DataFrame( + {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df + + df[(df.AAA <= 6) & (df.index.isin([0,2,4]))] + `Use loc for label-oriented slicing and iloc positional slicing `__ +.. ipython:: python + + data = {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]} + df = pd.DataFrame(data=data,index=['foo','bar','boo','kar']); df + +There are 2 explicit slicing methods, with a third general case + +1. Positional-oriented (Python slicing style : exclusive of end) +2. Label-oriented (Non-Python slicing style : inclusive of end) +3. General (Either slicing style : depends on if the slice contains labels or positions) + +.. 
ipython:: python + df.iloc[0:3] #Positional + + df.loc['bar':'kar'] #Label + + #Generic + df.ix[0:3] #Same as .iloc[0:3] + df.ix['bar':'kar'] #Same as .loc['bar':'kar'] + +Ambiguity arises when an index consists of integers with a non-zero start or non-unit increment. + +.. ipython:: python + + df2 = pd.DataFrame(data=data,index=[1,2,3,4]); #Note index starts at 1. + + df2.iloc[1:3] #Position-oriented + + df2.loc[1:3] #Label-oriented + + df2.ix[1:3] #General, will mimic loc (label-oriented) + df2.ix[0:3] #General, will mimic iloc (position-oriented), as loc[0:3] would raise a KeyError + +`Using inverse operator (~) to take the complement of a mask +`__ + +.. ipython:: python + + df = pd.DataFrame( + {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40], 'CCC' : [100,50,-30,-50]}); df + + df[~((df.AAA <= 6) & (df.index.isin([0,2,4])))] + +Panels +****** + `Extend a panel frame by transposing, adding a new dimension, and transposing back to the original dimensions `__ +.. ipython:: python + + rng = pd.date_range('1/1/2013',periods=100,freq='D') + data = np.random.randn(100, 4) + cols = ['A','B','C','D'] + df1, df2, df3 = pd.DataFrame(data, rng, cols), pd.DataFrame(data, rng, cols), pd.DataFrame(data, rng, cols) + + pf = pd.Panel({'df1':df1,'df2':df2,'df3':df3});pf + + #Assignment using Transpose (pandas < 0.15) + pf = pf.transpose(2,0,1) + pf['E'] = pd.DataFrame(data, rng, cols) + pf = pf.transpose(1,2,0);pf + + #Direct assignment (pandas > 0.15) + pf.loc[:,:,'F'] = pd.DataFrame(data, rng, cols);pf + `Mask a panel by using np.where and then reconstructing the panel with the new masked values `__ -`Using ~ to take the complement of a boolean array, see -`__ +New Columns +*********** -`Efficiently creating columns using applymap +`Efficiently and dynamically creating new columns using applymap `__ +.. ipython:: python + + df = pd.DataFrame( + {'AAA' : [1,2,1,3], 'BBB' : [1,1,2,2], 'CCC' : [2,1,3,1]}); df + + source_cols = df.columns # or some subset would work too. + new_cols = [str(x) + "_cat" for x in source_cols] + categories = {1 : 'Alpha', 2 : 'Beta', 3 : 'Charlie' } + + df[new_cols] = df[source_cols].applymap(categories.get);df + `Keep other columns when using min() with groupby `__ +.. ipython:: python + + df = pd.DataFrame( + {'AAA' : [1,1,1,2,2,2,3,3], 'BBB' : [2,1,3,4,5,1,2,3]}); df + +Method 1 : idxmin() to get the index of the mins + +.. ipython:: python + + df.loc[df.groupby("AAA")["BBB"].idxmin()] + +Method 2 : sort then take first of each + +.. ipython:: python + + df.sort("BBB").groupby("AAA", as_index=False).first() + +Notice the same results, with the exception of the index. + .. _cookbook.multi_index: MultiIndexing ------------- -The :ref:`multindexing ` docs. +The :ref:`multindexing ` docs. `Creating a multi-index from a labeled frame `__ +.. ipython:: python + + df = pd.DataFrame({'row' : [0,1,2], + 'One_X' : [1.1,1.1,1.1], + 'One_Y' : [1.2,1.2,1.2], + 'Two_X' : [1.11,1.11,1.11], + 'Two_Y' : [1.22,1.22,1.22]}); df + + # As Labelled Index + df = df.set_index('row');df + # With Heirarchical Columns + df.columns = pd.MultiIndex.from_tuples([tuple(c.split('_')) for c in df.columns]);df + # Now stack & Reset + df = df.stack(0).reset_index(1);df + # And fix the labels (Notice the label 'level_1' got added automatically) + df.columns = ['Sample','All_X','All_Y'];df + Arithmetic -~~~~~~~~~~ +********** `Performing arithmetic with a multi-index that needs broadcasting `__ +.. 
ipython:: python + + cols = pd.MultiIndex.from_tuples([ (x,y) for x in ['A','B','C'] for y in ['O','I']]) + df = pd.DataFrame(np.random.randn(2,6),index=['n','m'],columns=cols); df + df = df.div(df['C'],level=1); df + Slicing -~~~~~~~ +******* `Slicing a multi-index with xs `__ -`Slicing a multi-index with xs #2 +.. ipython:: python + + coords = [('AA','one'),('AA','six'),('BB','one'),('BB','two'),('BB','six')] + index = pd.MultiIndex.from_tuples(coords) + df = pd.DataFrame([11,22,33,44,55],index,['MyData']); df + +To take the cross section of the 1st level and 1st axis the index: + +.. ipython:: python + + df.xs('BB',level=0,axis=0) #Note : level and axis are optional, and default to zero + +...and now the 2nd level of the 1st axis. + +.. ipython:: python + + df.xs('six',level=1,axis=0) + +`Slicing a multi-index with xs, method #2 `__ +.. ipython:: python + + index = list(itertools.product(['Ada','Quinn','Violet'],['Comp','Math','Sci'])) + headr = list(itertools.product(['Exams','Labs'],['I','II'])) + + indx = pd.MultiIndex.from_tuples(index,names=['Student','Course']) + cols = pd.MultiIndex.from_tuples(headr) #Notice these are un-named + + data = [[70+x+y+(x*y)%3 for x in range(4)] for y in range(9)] + + df = pd.DataFrame(data,indx,cols); df + + All = slice(None) + + df.loc['Violet'] + df.loc[(All,'Math'),All] + df.loc[(slice('Ada','Quinn'),'Math'),All] + df.loc[(All,'Math'),('Exams')] + df.loc[(All,'Math'),(All,'II')] + `Setting portions of a multi-index with xs `__ Sorting -~~~~~~~ +******* -`Multi-index sorting +`Sort by specific column or an ordered list of columns, with a multi-index `__ -`Partial Selection, the need for sortedness +.. ipython:: python + + df.sort(('Labs', 'II'), ascending=False) + +`Partial Selection, the need for sortedness; `__ Levels -~~~~~~ +****** `Prepending a level to a multiindex `__ @@ -128,7 +420,7 @@ Levels `__ panelnd -~~~~~~~ +******* The :ref:`panelnd` docs. @@ -155,7 +447,7 @@ Fill forward a reversed timeseries `__ Replace -~~~~~~~ +******* `Using replace with backrefs `__ @@ -170,43 +462,134 @@ The :ref:`grouping ` docs. `Basic grouping with apply `__ +Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to all the columns + +.. ipython:: python + + df = pd.DataFrame({'animal': 'cat dog cat fish dog cat cat'.split(), + 'size': list('SSMMMLL'), + 'weight': [8, 10, 11, 1, 20, 12, 12], + 'adult' : [False] * 5 + [True] * 2}); df + + #List the size of the animals with the highest weight. + df.groupby('animal').apply(lambda subf: subf['size'][subf['weight'].idxmax()]) + `Using get_group `__ +.. ipython:: python + + gb = df.groupby(['animal']) + + gb.get_group('cat') + `Apply to different items in a group `__ +.. ipython:: python + + def GrowUp(x): + avg_weight = sum(x[x.size == 'S'].weight * 1.5) + avg_weight += sum(x[x.size == 'M'].weight * 1.25) + avg_weight += sum(x[x.size == 'L'].weight) + avg_weight = avg_weight / len(x) + return pd.Series(['L',avg_weight,True], index=['size', 'weight', 'adult']) + + expected_df = gb.apply(GrowUp) + + expected_df + `Expanding Apply `__ -`Replacing values with groupby means +.. ipython:: python + + S = pd.Series([i / 100.0 for i in range(1,11)]) + + def CumRet(x,y): + return x * (1 + y) + + def Red(x): + return functools.reduce(CumRet,x,1.0) + + pd.expanding_apply(S, Red) + + +`Replacing some values with mean of the rest of a group `__ -`Sort by group with aggregation +.. 
ipython:: python + + df = pd.DataFrame({'A' : [1, 1, 2, 2], 'B' : [1, -1, 1, 2]}) + + gb = df.groupby('A') + + def replace(g): + mask = g < 0 + g.loc[mask] = g[~mask].mean() + return g + + gb.transform(replace) + +`Sort groups by aggregated data `__ +.. ipython:: python + + df = pd.DataFrame({'code': ['foo', 'bar', 'baz'] * 2, + 'data': [0.16, -0.21, 0.33, 0.45, -0.59, 0.62], + 'flag': [False, True] * 3}) + + code_groups = df.groupby('code') + + agg_n_sort_order = code_groups[['data']].transform(sum).sort('data') + + sorted_df = df.ix[agg_n_sort_order.index] + + sorted_df + `Create multiple aggregated columns `__ +.. ipython:: python + + rng = pd.date_range(start="2014-10-07",periods=10,freq='2min') + ts = pd.Series(data = list(range(10)), index = rng) + + def MyCust(x): + if len(x) > 2: + return x[1] * 1.234 + return pd.NaT + + mhc = {'Mean' : np.mean, 'Max' : np.max, 'Custom' : MyCust} + ts.resample("5min",how = mhc) + ts + `Create a value counts column and reassign back to the DataFrame `__ +.. ipython:: python + + df = pd.DataFrame({'Color': 'Red Red Red Blue'.split(), + 'Value': [100, 150, 50, 50]}); df + df['Counts'] = df.groupby(['Color']).transform(len) + df + `Shift groups of the values in a column based on the index `__ .. ipython:: python df = pd.DataFrame( - {u'line_race': [10L, 10L, 8L, 10L, 10L, 8L], - u'beyer': [99L, 102L, 103L, 103L, 88L, 100L]}, - index=[u'Last Gunfighter', u'Last Gunfighter', u'Last Gunfighter', - u'Paynter', u'Paynter', u'Paynter']); df - + {u'line_race': [10, 10, 8, 10, 10, 8], + u'beyer': [99, 102, 103, 103, 88, 100]}, + index=[u'Last Gunfighter', u'Last Gunfighter', u'Last Gunfighter', + u'Paynter', u'Paynter', u'Paynter']); df df['beyer_shifted'] = df.groupby(level=0)['beyer'].shift(1) df Expanding Data -~~~~~~~~~~~~~~ +************** `Alignment and to-date `__ @@ -218,35 +601,109 @@ Expanding Data `__ Splitting -~~~~~~~~~ +********* `Splitting a frame `__ +Create a list of dataframes, split using a delineation based on logic included in rows. + +.. ipython:: python + + df = pd.DataFrame(data={'Case' : ['A','A','A','B','A','A','B','A','A'], + 'Data' : np.random.randn(9)}) + + dfs = list(zip(*df.groupby(pd.rolling_median((1*(df['Case']=='B')).cumsum(),3,True))))[-1] + + dfs[0] + dfs[1] + dfs[2] + .. _cookbook.pivot: Pivot -~~~~~ +***** The :ref:`Pivot ` docs. `Partial sums and subtotals `__ +.. ipython:: python + + df = pd.DataFrame(data={'Province' : ['ON','QC','BC','AL','AL','MN','ON'], + 'City' : ['Toronto','Montreal','Vancouver','Calgary','Edmonton','Winnipeg','Windsor'], + 'Sales' : [13,6,16,8,4,3,1]}) + table = pd.pivot_table(df,values=['Sales'],index=['Province'],columns=['City'],aggfunc=np.sum,margins=True) + table.stack('City') + `Frequency table like plyr in R `__ +.. 
ipython:: python + + grades = [48,99,75,80,42,80,72,68,36,78] + df = pd.DataFrame( {'ID': ["x%d" % r for r in range(10)], + 'Gender' : ['F', 'M', 'F', 'M', 'F', 'M', 'F', 'M', 'M', 'M'], + 'ExamYear': ['2007','2007','2007','2008','2008','2008','2008','2009','2009','2009'], + 'Class': ['algebra', 'stats', 'bio', 'algebra', 'algebra', 'stats', 'stats', 'algebra', 'bio', 'bio'], + 'Participated': ['yes','yes','yes','yes','no','yes','yes','yes','yes','yes'], + 'Passed': ['yes' if x > 50 else 'no' for x in grades], + 'Employed': [True,True,True,False,False,False,False,True,True,False], + 'Grade': grades}) + + df.groupby('ExamYear').agg({'Participated': lambda x: x.value_counts()['yes'], + 'Passed': lambda x: sum(x == 'yes'), + 'Employed' : lambda x : sum(x), + 'Grade' : lambda x : sum(x) / len(x)}) + Apply -~~~~~ +***** -`Turning embedded lists into a multi-index frame +`Rolling Apply to Organize - Turning embedded lists into a multi-index frame `__ -`Rolling apply with a DataFrame returning a Series +.. ipython:: python + + df = pd.DataFrame(data={'A' : [[2,4,8,16],[100,200],[10,20,30]], 'B' : [['a','b','c'],['jj','kk'],['ccc']]},index=['I','II','III']) + + def SeriesFromSubList(aList): + return pd.Series(aList) + + df_orgz = pd.concat(dict([ (ind,row.apply(SeriesFromSubList)) for ind,row in df.iterrows() ])) + +`Rolling Apply with a DataFrame returning a Series `__ +Rolling Apply to multiple columns where function calculates a Series before a Scalar from the Series is returned + +.. ipython:: python + + df = pd.DataFrame(data=np.random.randn(2000,2)/10000, + index=pd.date_range('2001-01-01',periods=2000), + columns=['A','B']); df + + def gm(aDF,Const): + v = ((((aDF.A+aDF.B)+1).cumprod())-1)*Const + return (aDF.index[0],v.iloc[-1]) + + S = pd.Series(dict([ gm(df.iloc[i:min(i+51,len(df)-1)],5) for i in range(len(df)-50) ])); S + `Rolling apply with a DataFrame returning a Scalar `__ +Rolling Apply to multiple columns where function returns a Scalar (Volume Weighted Average Price) + +.. ipython:: python + + rng = pd.date_range(start = '2014-01-01',periods = 100) + df = pd.DataFrame({'Open' : np.random.randn(len(rng)), + 'Close' : np.random.randn(len(rng)), + 'Volume' : np.random.randint(100,2000,len(rng))}, index=rng); df + + def vwap(bars): return ((bars.Close*bars.Volume).sum()/bars.Volume.sum()).round(2) + window = 5 + s = pd.concat([ (pd.Series(vwap(df.iloc[i:i+window]), index=[df.index[i+window]])) for i in range(len(df)-window) ]); s + Timeseries ---------- @@ -273,13 +730,13 @@ Calculate the first day of the month for each entry in a DatetimeIndex .. ipython:: python - dates = pd.date_range('2000-01-01', periods=5) - dates.to_period(freq='M').to_timestamp() + dates = pd.date_range('2000-01-01', periods=5) + dates.to_period(freq='M').to_timestamp() .. _cookbook.resample: Resampling -~~~~~~~~~~ +********** The :ref:`Resample ` docs. @@ -310,12 +767,35 @@ Merge The :ref:`Concat ` docs. The :ref:`Join ` docs. -`emulate R rbind +`Append two dataframes with overlapping index (emulate R rbind) `__ -`Self Join +.. ipython:: python + + rng = pd.date_range('2000-01-01', periods=6) + df1 = pd.DataFrame(np.random.randn(6, 3), index=rng, columns=['A', 'B', 'C']) + df2 = df1.copy() + +ignore_index is needed in pandas < v0.13, and depending on df construction + +.. ipython:: python + + df = df1.append(df2,ignore_index=True); df + +`Self Join of a DataFrame `__ +.. 
ipython:: python + + df = pd.DataFrame(data={'Area' : ['A'] * 5 + ['C'] * 2, + 'Bins' : [110] * 2 + [160] * 3 + [40] * 2, + 'Test_0' : [0, 1, 0, 1, 2, 0, 1], + 'Data' : np.random.randn(7)});df + + df['Test_1'] = df['Test_0'] - 1 + + pd.merge(df, df, left_on=['Bins', 'Area','Test_0'], right_on=['Bins', 'Area','Test_1'],suffixes=('_L','_R')) + `How to set the index and join `__ @@ -361,19 +841,17 @@ The :ref:`Plotting ` docs. .. ipython:: python - df = pd.DataFrame( + df = pd.DataFrame( {u'stratifying_var': np.random.uniform(0, 100, 20), - u'price': np.random.normal(100, 5, 20)} - ) - df[u'quartiles'] = pd.qcut( - df[u'stratifying_var'], - 4, - labels=[u'0-25%', u'25-50%', u'50-75%', u'75-100%'] - ) - - @savefig quartile_boxplot.png - df.boxplot(column=u'price', by=u'quartiles') + u'price': np.random.normal(100, 5, 20)}) + + df[u'quartiles'] = pd.qcut( + df[u'stratifying_var'], + 4, + labels=[u'0-25%', u'25-50%', u'50-75%', u'75-100%']) + @savefig quartile_boxplot.png + df.boxplot(column=u'price', by=u'quartiles') Data In/Out ----------- @@ -384,7 +862,7 @@ Data In/Out .. _cookbook.csv: CSV -~~~ +*** The :ref:`CSV ` docs @@ -464,7 +942,7 @@ Parsing date components in multi-columns is faster with a format .. _cookbook.sql: SQL -~~~ +*** The :ref:`SQL ` docs @@ -474,7 +952,7 @@ The :ref:`SQL ` docs .. _cookbook.excel: Excel -~~~~~ +***** The :ref:`Excel ` docs @@ -489,7 +967,7 @@ header `__ .. _cookbook.hdf: HDFStore -~~~~~~~~ +******** The :ref:`HDFStores ` docs @@ -519,9 +997,12 @@ csv file and creating a store by chunks, with date parsing as well. `Reading in a sequence of files, then providing a global unique index to a store while appending `__ -`Groupby on a HDFStore +`Groupby on a HDFStore with low group density `__ +`Groupby on a HDFStore with high group density +`__ + `Hierarchical queries on a HDFStore `__ @@ -541,25 +1022,25 @@ Storing Attributes to a group node .. ipython:: python - df = DataFrame(np.random.randn(8,3)) - store = HDFStore('test.h5') - store.put('df',df) - - # you can store an arbitrary python object via pickle - store.get_storer('df').attrs.my_attribute = dict(A = 10) - store.get_storer('df').attrs.my_attribute + df = pd.DataFrame(np.random.randn(8,3)) + store = pd.HDFStore('test.h5') + store.put('df',df) + + # you can store an arbitrary python object via pickle + store.get_storer('df').attrs.my_attribute = dict(A = 10) + store.get_storer('df').attrs.my_attribute .. ipython:: python :suppress: - store.close() - os.remove('test.h5') + store.close() + os.remove('test.h5') .. _cookbook.binary: Binary Files -~~~~~~~~~~~~ +************ pandas readily accepts numpy record arrays, if you need to read in a binary file consisting of an array of C structs. For example, given this C program @@ -603,9 +1084,6 @@ in the frame: .. code-block:: python - import numpy as np - from pandas import DataFrame - names = 'count', 'avg', 'scale' # note that the offsets are larger than the size of the type because of @@ -614,7 +1092,7 @@ in the frame: formats = 'i4', 'f8', 'f4' dt = np.dtype({'names': names, 'offsets': offsets, 'formats': formats}, align=True) - df = DataFrame(np.fromfile('binary.dat', dt)) + df = pd.DataFrame(np.fromfile('binary.dat', dt)) .. note:: @@ -630,19 +1108,55 @@ Computation `Numerical integration (sample-based) of a time series `__ -Miscellaneous -------------- +Timedeltas +---------- -The :ref:`Timedeltas ` docs. +The :ref:`Timedeltas ` docs. -`Operating with timedeltas +`Using timedeltas `__ -`Create timedeltas with date differences +.. 
ipython:: python + + s = pd.Series(pd.date_range('2012-1-1', periods=3, freq='D')) + + s - s.max() + + s.max() - s + + s - datetime.datetime(2011,1,1,3,5) + + s + datetime.timedelta(minutes=5) + + datetime.datetime(2011,1,1,3,5) - s + + datetime.timedelta(minutes=5) + s + +`Adding and subtracting deltas and dates +`__ + +.. ipython:: python + + deltas = pd.Series([ datetime.timedelta(days=i) for i in range(3) ]) + + df = pd.DataFrame(dict(A = s, B = deltas)); df + + df['New Dates'] = df['A'] + df['B']; + + df['Delta'] = df['A'] - df['New Dates']; df + + df.dtypes + +`Another example `__ + +Values can be set to NaT using np.nan, similar to datetime -`Adding days to dates in a dataframe -`__ +.. ipython:: python + + y = s - s.shift(); y + + y[1] = np.nan; y Aliasing Axis Names ------------------- @@ -652,23 +1166,23 @@ To globally provide aliases for axis names, one can define these 2 functions: .. ipython:: python def set_axis_alias(cls, axis, alias): - if axis not in cls._AXIS_NUMBERS: - raise Exception("invalid axis [%s] for alias [%s]" % (axis, alias)) - cls._AXIS_ALIASES[alias] = axis + if axis not in cls._AXIS_NUMBERS: + raise Exception("invalid axis [%s] for alias [%s]" % (axis, alias)) + cls._AXIS_ALIASES[alias] = axis .. ipython:: python def clear_axis_alias(cls, axis, alias): - if axis not in cls._AXIS_NUMBERS: - raise Exception("invalid axis [%s] for alias [%s]" % (axis, alias)) - cls._AXIS_ALIASES.pop(alias,None) - + if axis not in cls._AXIS_NUMBERS: + raise Exception("invalid axis [%s] for alias [%s]" % (axis, alias)) + cls._AXIS_ALIASES.pop(alias,None) + .. ipython:: python - set_axis_alias(DataFrame,'columns', 'myaxis2') - df2 = DataFrame(randn(3,2),columns=['c1','c2'],index=['i1','i2','i3']) + set_axis_alias(pd.DataFrame,'columns', 'myaxis2') + df2 = pd.DataFrame(np.random.randn(3,2),columns=['c1','c2'],index=['i1','i2','i3']) df2.sum(axis='myaxis2') - clear_axis_alias(DataFrame,'columns', 'myaxis2') + clear_axis_alias(pd.DataFrame,'columns', 'myaxis2') Creating Example Data --------------------- @@ -679,15 +1193,13 @@ of the data values: .. ipython:: python - import itertools - def expand_grid(data_dict): - rows = itertools.product(*data_dict.values()) - return pd.DataFrame.from_records(rows, columns=data_dict.keys()) + def expand_grid(data_dict): + rows = itertools.product(*data_dict.values()) + return pd.DataFrame.from_records(rows, columns=data_dict.keys()) - df = expand_grid( - {'height': [60, 70], - 'weight': [100, 140, 180], - 'sex': ['Male', 'Female']} - ) - df + df = expand_grid( + {'height': [60, 70], + 'weight': [100, 140, 180], + 'sex': ['Male', 'Female']}) + df \ No newline at end of file diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index 928de285982cf..44321375d31a2 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -828,7 +828,7 @@ Conversion to DataFrame ~~~~~~~~~~~~~~~~~~~~~~~ A Panel can be represented in 2D form as a hierarchically indexed -DataFrame. See the section :ref:`hierarchical indexing ` +DataFrame. See the section :ref:`hierarchical indexing ` for more on this. To convert a Panel to a DataFrame, use the ``to_frame`` method: diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index e5009aeb1c6f6..e5afe1db9417f 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -18,6 +18,7 @@ tools in the PyData space. We'd like to make it easier for users to find these project, if you know of other substantial projects that you feel should be on this list, please let us know. + .. 
_ecosystem.stats: Statistics and Machine Learning @@ -34,7 +35,8 @@ Statsmodels leverages pandas objects as the underlying data container for comput `sklearn-pandas `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Use pandas DataFrames in your scikit-learn ML pipeline. +Use pandas DataFrames in your `scikit-learn `__ +ML pipeline. @@ -43,12 +45,13 @@ Use pandas DataFrames in your scikit-learn ML pipeline. Visualization ------------- -`Vincent `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +`Bokeh `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The `Vincent `__ project leverages `Vega `__ -(that in turn, leverages `d3 `__) to create plots . It has great support -for pandas data objects. +Bokeh is a Python interactive visualization library for large datasets that natively uses +the latest web technologies. Its goal is to provide elegant, concise construction of novel +graphics in the style of Protovis/D3, while delivering high-performance interactivity over +large data to thin clients. `yhat/ggplot `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -70,13 +73,63 @@ to cover. The `Seaborn `__ project builds on and `matplotlib `__ to provide easy plotting of data which extends to more advanced types of plots then those offered by pandas. -`Bokeh `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +`Vincent `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The `Vincent `__ project leverages `Vega `__ +(that in turn, leverages `d3 `__) to create plots . It has great support +for pandas data objects. + + +.. _ecosystem.ide: + +IDE +------ + +`IPython `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +IPython is an interactive command shell and distributed computing +environment. +IPython Notebook is a web application for creating IPython notebooks. +An IPython notebook is a JSON document containing an ordered list +of input/output cells which can contain code, text, mathematics, plots +and rich media. +IPython notebooks can be converted to a number of open standard output formats +(HTML, HTML presentation slides, LaTeX, PDF, ReStructuredText, Markdown, +Python) through 'Download As' in the web interface and ``ipython nbconvert`` +in a shell. + +Pandas DataFrames implement ``_repr_html_`` methods +which are utilized by IPython Notebook for displaying +(abbreviated) HTML tables. (Note: HTML tables may or may not be +compatible with non-HTML IPython output formats.) + +`quantopian/qgrid `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +qgrid is "an interactive grid for sorting and filtering +DataFrames in IPython Notebook" built with SlickGrid. + +`Spyder `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Spyder is a cross-platform Qt-based open-source Python IDE with +editing, testing, debugging, and introspection features. +Spyder can now introspect and display Pandas DataFrames and show +both "column wise min/max and global min/max coloring." + + +.. _ecosystem.api: + +API +----- + +`quandl/Python `_ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Quandl API for Python wraps the Quandl REST API to return +Pandas DataFrames with timeseries indexes. -Bokeh is a Python interactive visualization library for large datasets that natively uses -the latest web technologies. Its goal is to provide elegant, concise construction of novel -graphics in the style of Protovis/D3, while delivering high-performance interactivity over -large data to thin clients. .. 
_ecosystem.domain: @@ -89,3 +142,24 @@ Domain Specific Geopandas extends pandas data objects to include geographic information which support geometric operations. If your work entails maps and geographical coordinates, and you love pandas, you should take a close look at Geopandas. + +`xray `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +xray brings the labeled data power of pandas to the physical sciences by +providing N-dimensional variants of the core pandas data structures. It aims to +provide a pandas-like and pandas-compatible toolkit for analytics on multi- +dimensional arrays, rather than the tabular data for which pandas excels. + + +.. _ecosystem.out-of-core: + +Out-of-core +------------- + +`Blaze `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Blaze provides a standard API for doing computations with various +in-memory and on-disk backends: NumPy, Pandas, SQLAlchemy, MongoDB, PyTables, +PySpark. diff --git a/doc/source/faq.rst b/doc/source/faq.rst index a613d53218ce2..b93e5ae9c922a 100644 --- a/doc/source/faq.rst +++ b/doc/source/faq.rst @@ -24,6 +24,67 @@ Frequently Asked Questions (FAQ) options.display.mpl_style='default' from pandas.compat import lrange + +.. _df-memory-usage: + +DataFrame memory usage +~~~~~~~~~~~~~~~~~~~~~~ +As of pandas version 0.15.0, the memory usage of a dataframe (including +the index) is shown when accessing the ``info`` method of a dataframe. A +configuration option, ``display.memory_usage`` (see :ref:`options`), +specifies if the dataframe's memory usage will be displayed when +invoking the ``df.info()`` method. + +For example, the memory usage of the dataframe below is shown +when calling ``df.info()``: + +.. ipython:: python + + dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]', + 'complex128', 'object', 'bool'] + n = 5000 + data = dict([ (t, np.random.randint(100, size=n).astype(t)) + for t in dtypes]) + df = DataFrame(data) + df['categorical'] = df['object'].astype('category') + + df.info() + +The ``+`` symbol indicates that the true memory usage could be higher, because +pandas does not count the memory used by values in columns with +``dtype=object``. + +By default the display option is set to ``True`` but can be explicitly +overridden by passing the ``memory_usage`` argument when invoking ``df.info()``. + +The memory usage of each column can be found by calling the ``memory_usage`` +method. This returns a Series with an index represented by column names +and memory usage of each column shown in bytes. For the dataframe above, +the memory usage of each column and the total memory usage of the +dataframe can be found with the memory_usage method: + +.. ipython:: python + + df.memory_usage() + + # total memory usage of dataframe + df.memory_usage().sum() + +By default the memory usage of the dataframe's index is not shown in the +returned Series, the memory usage of the index can be shown by passing +the ``index=True`` argument: + +.. ipython:: python + + df.memory_usage(index=True) + +The memory usage displayed by the ``info`` method utilizes the +``memory_usage`` method to determine the memory usage of a dataframe +while also formatting the output in human-readable units (base-2 +representation; i.e., 1KB = 1024 bytes). + +See also :ref:`Categorical Memory Usage `. + .. 
_ref-monkey-patching: Adding Features to your pandas Installation diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index eaccbfddc1f86..1b21c5d7291e5 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -233,7 +233,7 @@ however pass ``sort=False`` for potential speedups: GroupBy with MultiIndex ~~~~~~~~~~~~~~~~~~~~~~~ -With :ref:`hierarchically-indexed data `, it's quite +With :ref:`hierarchically-indexed data `, it's quite natural to group by one of the levels of the hierarchy. .. ipython:: python @@ -358,7 +358,7 @@ An obvious one is aggregation via the ``aggregate`` or equivalently ``agg`` meth As you can see, the result of the aggregation will have the group names as the new index along the grouped axis. In the case of multiple keys, the result is a -:ref:`MultiIndex ` by default, though this can be +:ref:`MultiIndex ` by default, though this can be changed by using the ``as_index`` option: .. ipython:: python @@ -869,7 +869,7 @@ This shows the first or last n rows from each group. Taking the nth row of each group ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -To select from a DataFrame or Series the nth item, use the nth method. This is a reduction method, and will return a single row (or no row) per group: +To select from a DataFrame or Series the nth item, use the nth method. This is a reduction method, and will return a single row (or no row) per group if you pass an int for n: .. ipython:: python @@ -880,7 +880,7 @@ To select from a DataFrame or Series the nth item, use the nth method. This is a g.nth(-1) g.nth(1) -If you want to select the nth not-null method, use the ``dropna`` kwarg. For a DataFrame this should be either ``'any'`` or ``'all'`` just like you would pass to dropna, for a Series this just needs to be truthy. +If you want to select the nth not-null item, use the ``dropna`` kwarg. For a DataFrame this should be either ``'any'`` or ``'all'`` just like you would pass to dropna, for a Series this just needs to be truthy. .. ipython:: python @@ -904,6 +904,15 @@ As with other methods, passing ``as_index=False``, will achieve a filtration, wh g.nth(0) g.nth(-1) +You can also select multiple rows from each group by specifying multiple nth values as a list of ints. + +.. ipython:: python + + business_dates = date_range(start='4/1/2014', end='6/30/2014', freq='B') + df = DataFrame(1, index=business_dates, columns=['a', 'b']) + # get the first, 4th, and last date index for each month + df.groupby((df.index.year, df.index.month)).nth([0, 3, -1]) + Enumerate group items ~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template index 4e1d2b471d1c0..ee779715bcb95 100644 --- a/doc/source/index.rst.template +++ b/doc/source/index.rst.template @@ -122,17 +122,19 @@ See the package overview for more detail about what's in the library. cookbook dsintro basics + text options indexing + advanced computation missing_data groupby merging reshaping timeseries + timedeltas categorical visualization - rplot io remote_data enhancingperf @@ -148,5 +150,6 @@ See the package overview for more detail about what's in the library. {% endif -%} {%if not single -%} contributing + internals release {% endif -%} diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 04aa07a49ba8a..920be3672acd4 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -58,10 +58,12 @@ indexing. but instead subclass ``PandasObject``, similarly to the rest of the pandas objects. 
This should be a transparent change with only very limited API implications (See the :ref:`Internal Refactoring `) +See the :ref:`MultiIndex / Advanced Indexing ` for ``MultiIndex`` and more advanced indexing documentation. + See the :ref:`cookbook` for some advanced strategies -Different Choices for Indexing (``loc``, ``iloc``, and ``ix``) --------------------------------------------------------------- +Different Choices for Indexing +------------------------------ .. versionadded:: 0.11.0 @@ -94,17 +96,18 @@ of multi-axis indexing. See more at :ref:`Selection by Position ` - ``.ix`` supports mixed integer and label based access. It is primarily label - based, but will fall back to integer positional access. ``.ix`` is the most - general and will support any of the inputs to ``.loc`` and ``.iloc``, as well - as support for floating point label schemes. ``.ix`` is especially useful - when dealing with mixed positional and label based hierarchical indexes. - As using integer slices with ``.ix`` have different behavior depending on - whether the slice is interpreted as position based or label based, it's - usually better to be explicit and use ``.iloc`` or ``.loc``. - - See more at :ref:`Advanced Indexing `, :ref:`Advanced - Hierarchical ` and :ref:`Fallback Indexing - ` + based, but will fall back to integer positional access unless the corresponding + axis is of integer type. ``.ix`` is the most general and will + support any of the inputs in ``.loc`` and ``.iloc``. ``.ix`` also supports floating point + label schemes. ``.ix`` is exceptionally useful when dealing with mixed positional + and label based hierachical indexes. + + However, when an axis is integer based, ONLY + label based access and not positional access is supported. + Thus, in such cases, it's usually better to be explicit and use ``.iloc`` or ``.loc``. + + See more at :ref:`Advanced Indexing ` and :ref:`Advanced + Hierarchical `. Getting values from an object with multi-axes selection uses the following notation (using ``.loc`` as an example, but applies to ``.iloc`` and ``.ix`` as @@ -228,11 +231,17 @@ new column. .. warning:: - You can use this access only if the index element is a valid python identifier, e.g. ``s.1`` is not allowed. - see `here for an explanation of valid identifiers + See `here for an explanation of valid identifiers `__. - The attribute will not be available if it conflicts with an existing method name, e.g. ``s.min`` is not allowed. + - Similarly, the attribute will not be available if it conflicts with any of the following list: ``index``, + ``major_axis``, ``minor_axis``, ``items``, ``labels``. + + - In any of these cases, standard indexing will still work, e.g. ``s['1']``, ``s['min']``, and ``s['index']`` will + access the corresponding element or column. + - The ``Series/Panel`` accesses are available starting in 0.13.0. If you are using the IPython environment, you may also use tab-completion to @@ -579,7 +588,7 @@ more complex criteria: df2[criterion & (df2['b'] == 'x')] Note, with the choice methods :ref:`Selection by Label `, :ref:`Selection by Position `, -and :ref:`Advanced Indexing ` you may select along more than one axis using boolean vectors combined with other indexing expressions. +and :ref:`Advanced Indexing ` you may select along more than one axis using boolean vectors combined with other indexing expressions. .. ipython:: python @@ -1078,71 +1087,6 @@ floating point values generated using ``numpy.random.randn()``. 
df = DataFrame(randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D']) df2 = df.copy() -Take Methods ------------- - -.. _indexing.take: - -Similar to numpy ndarrays, pandas Index, Series, and DataFrame also provides -the ``take`` method that retrieves elements along a given axis at the given -indices. The given indices must be either a list or an ndarray of integer -index positions. ``take`` will also accept negative integers as relative positions to the end of the object. - -.. ipython:: python - - index = Index(randint(0, 1000, 10)) - index - - positions = [0, 9, 3] - - index[positions] - index.take(positions) - - ser = Series(randn(10)) - - ser.ix[positions] - ser.take(positions) - -For DataFrames, the given indices should be a 1d list or ndarray that specifies -row or column positions. - -.. ipython:: python - - frm = DataFrame(randn(5, 3)) - - frm.take([1, 4, 3]) - - frm.take([0, 2], axis=1) - -It is important to note that the ``take`` method on pandas objects are not -intended to work on boolean indices and may return unexpected results. - -.. ipython:: python - - arr = randn(10) - arr.take([False, False, True, True]) - arr[[0, 1]] - - ser = Series(randn(10)) - ser.take([False, False, True, True]) - ser.ix[[0, 1]] - -Finally, as a small note on performance, because the ``take`` method handles -a narrower range of inputs, it can offer performance that is a good deal -faster than fancy indexing. - -.. ipython:: - - arr = randn(10000, 5) - indexer = np.arange(10000) - random.shuffle(indexer) - - timeit arr[indexer] - timeit arr.take(indexer, axis=0) - - ser = Series(arr[:, 0]) - timeit ser.ix[indexer] - timeit ser.take(indexer) Duplicate Data -------------- @@ -1183,229 +1127,231 @@ default value. s.get('a') # equivalent to s['a'] s.get('x', default=-1) -.. _indexing.advanced: - -Advanced Indexing with ``.ix`` ------------------------------- - -.. note:: +The :meth:`~pandas.DataFrame.select` Method +------------------------------------------- - The recent addition of ``.loc`` and ``.iloc`` have enabled users to be quite - explicit about indexing choices. ``.ix`` allows a great flexibility to - specify indexing locations by *label* and/or *integer position*. pandas will - attempt to use any passed *integer* as *label* locations first (like what - ``.loc`` would do, then to fall back on *positional* indexing, like what - ``.iloc`` would do). See :ref:`Fallback Indexing ` for - an example. +Another way to extract slices from an object is with the ``select`` method of +Series, DataFrame, and Panel. This method should be used only when there is no +more direct way. ``select`` takes a function which operates on labels along +``axis`` and returns a boolean. For instance: -The syntax of using ``.ix`` is identical to ``.loc``, in :ref:`Selection by -Label `, and ``.iloc`` in :ref:`Selection by Position `. +.. ipython:: python -The ``.ix`` attribute takes the following inputs: + df.select(lambda x: x == 'A', axis=1) -- An integer or single label, e.g. ``5`` or ``'a'`` -- A list or array of labels ``['a', 'b', 'c']`` or integers ``[4, 3, 0]`` -- A slice object with ints ``1:7`` or labels ``'a':'f'`` -- A boolean array +The :meth:`~pandas.DataFrame.lookup` Method +------------------------------------------- -We'll illustrate all of these methods. First, note that this provides a concise -way of reindexing on multiple axes at once: +Sometimes you want to extract a set of values given a sequence of row labels +and column labels, and the ``lookup`` method allows for this and returns a +numpy array. 
For instance, .. ipython:: python - subindex = dates[[3,4,5]] - df.reindex(index=subindex, columns=['C', 'B']) - df.ix[subindex, ['C', 'B']] + dflookup = DataFrame(np.random.rand(20,4), columns = ['A','B','C','D']) + dflookup.lookup(list(range(0,10,2)), ['B','C','A','B','D']) -Assignment / setting values is possible when using ``ix``: +.. _indexing.class: -.. ipython:: python +Index objects +------------- - df2 = df.copy() - df2.ix[subindex, ['C', 'B']] = 0 - df2 +The pandas :class:`~pandas.Index` class and its subclasses can be viewed as +implementing an *ordered multiset*. Duplicates are allowed. However, if you try +to convert an :class:`~pandas.Index` object with duplicate entries into a +``set``, an exception will be raised. -Indexing with an array of integers can also be done: +:class:`~pandas.Index` also provides the infrastructure necessary for +lookups, data alignment, and reindexing. The easiest way to create an +:class:`~pandas.Index` directly is to pass a ``list`` or other sequence to +:class:`~pandas.Index`: .. ipython:: python - df.ix[[4,3,1]] - df.ix[dates[[4,3,1]]] - -**Slicing** has standard Python semantics for integer slices: - -.. ipython:: python + index = Index(['e', 'd', 'a', 'b']) + index + 'd' in index - df.ix[1:7, :2] +You can also pass a ``name`` to be stored in the index: -Slicing with labels is semantically slightly different because the slice start -and stop are **inclusive** in the label-based case: .. ipython:: python - date1, date2 = dates[[2, 4]] - print(date1, date2) - df.ix[date1:date2] - df['A'].ix[date1:date2] + index = Index(['e', 'd', 'a', 'b'], name='something') + index.name -Getting and setting rows in a DataFrame, especially by their location, is much -easier: +The name, if set, will be shown in the console display: .. ipython:: python - df2 = df[:5].copy() - df2.ix[3] - df2.ix[3] = np.arange(len(df2.columns)) - df2 + index = Index(list(range(5)), name='rows') + columns = Index(['A', 'B', 'C'], name='cols') + df = DataFrame(np.random.randn(5, 3), index=index, columns=columns) + df + df['A'] -Column or row selection can be combined as you would expect with arrays of -labels or even boolean vectors: +Setting metadata +~~~~~~~~~~~~~~~~ -.. ipython:: python +.. versionadded:: 0.13.0 - df.ix[df['A'] > 0, 'B'] - df.ix[date1:date2, 'B'] - df.ix[date1, 'B'] +.. _indexing.set_metadata: -Slicing with labels is closely related to the ``truncate`` method which does -precisely ``.ix[start:stop]`` but returns a copy (for legacy reasons). +Indexes are "mostly immutable", but it is possible to set and change their +metadata, like the index ``name`` (or, for ``MultiIndex``, ``levels`` and +``labels``). -The :meth:`~pandas.DataFrame.select` Method -------------------------------------------- +You can use the ``rename``, ``set_names``, ``set_levels``, and ``set_labels`` +to set these attributes directly. They default to returning a copy; however, +you can specify ``inplace=True`` to have the data change in place. -Another way to extract slices from an object is with the ``select`` method of -Series, DataFrame, and Panel. This method should be used only when there is no -more direct way. ``select`` takes a function which operates on labels along -``axis`` and returns a boolean. For instance: +See :ref:`Advanced Indexing ` for usage of MultiIndexes. .. 
ipython:: python - df.select(lambda x: x == 'A', axis=1) + ind = Index([1, 2, 3]) + ind.rename("apple") + ind + ind.set_names(["apple"], inplace=True) + ind.name = "bob" + ind -The :meth:`~pandas.DataFrame.lookup` Method -------------------------------------------- +.. versionadded:: 0.15.0 -Sometimes you want to extract a set of values given a sequence of row labels -and column labels, and the ``lookup`` method allows for this and returns a -numpy array. For instance, +``set_names``, ``set_levels``, and ``set_labels`` also take an optional +`level`` argument .. ipython:: python - dflookup = DataFrame(np.random.rand(20,4), columns = ['A','B','C','D']) - dflookup.lookup(list(range(0,10,2)), ['B','C','A','B','D']) -.. _indexing.float64index: - -Float64Index ------------- + index = MultiIndex.from_product([range(3), ['one', 'two']], names=['first', 'second']) + index + index.levels[1] + index.set_levels(["a", "b"], level=1) -.. note:: +Set operations on Index objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - As of 0.14.0, ``Float64Index`` is backed by a native ``float64`` dtype - array. Prior to 0.14.0, ``Float64Index`` was backed by an ``object`` dtype - array. Using a ``float64`` dtype in the backend speeds up arithmetic - operations by about 30x and boolean indexing operations on the - ``Float64Index`` itself are about 2x as fast. +.. _indexing.set_ops: +.. warning:: -.. versionadded:: 0.13.0 + In 0.15.0. the set operations ``+`` and ``-`` were deprecated in order to provide these for numeric type operations on certain + index types. ``+`` can be replace by ``.union()`` or ``|``, and ``-`` by ``.difference()``. -By default a ``Float64Index`` will be automatically created when passing floating, or mixed-integer-floating values in index creation. -This enables a pure label-based slicing paradigm that makes ``[],ix,loc`` for scalar indexing and slicing work exactly the -same. +The two main operations are ``union (|)``, ``intersection (&)`` +These can be directly called as instance methods or used via overloaded +operators. Difference is provided via the ``.difference()`` method. .. ipython:: python - indexf = Index([1.5, 2, 3, 4.5, 5]) - indexf - sf = Series(range(5),index=indexf) - sf + a = Index(['c', 'b', 'a']) + b = Index(['c', 'e', 'd']) + a | b + a & b + a.difference(b) -Scalar selection for ``[],.ix,.loc`` will always be label based. An integer will match an equal float index (e.g. ``3`` is equivalent to ``3.0``) +Also available is the ``sym_diff (^)`` operation, which returns elements +that appear in either ``idx1`` or ``idx2`` but not both. This is +equivalent to the Index created by ``idx1.difference(idx2).union(idx2.difference(idx1))``, +with duplicates dropped. .. ipython:: python - sf[3] - sf[3.0] - sf.ix[3] - sf.ix[3.0] - sf.loc[3] - sf.loc[3.0] + idx1 = Index([1, 2, 3, 4]) + idx2 = Index([2, 3, 4, 5]) + idx1.sym_diff(idx2) + idx1 ^ idx2 -The only positional indexing is via ``iloc`` +Set / Reset Index +----------------- -.. ipython:: python +Occasionally you will load or create a data set into a DataFrame and want to +add an index after you've already done so. There are a couple of different +ways. - sf.iloc[3] +Set an index +~~~~~~~~~~~~ -A scalar index that is not found will raise ``KeyError`` +.. 
_indexing.set_index: -Slicing is ALWAYS on the values of the index, for ``[],ix,loc`` and ALWAYS positional with ``iloc`` +DataFrame has a ``set_index`` method which takes a column name (for a regular +``Index``) or a list of column names (for a ``MultiIndex``), to create a new, +indexed DataFrame: .. ipython:: python + :suppress: - sf[2:4] - sf.ix[2:4] - sf.loc[2:4] - sf.iloc[2:4] - -In float indexes, slicing using floats is allowed + data = DataFrame({'a' : ['bar', 'bar', 'foo', 'foo'], + 'b' : ['one', 'two', 'one', 'two'], + 'c' : ['z', 'y', 'x', 'w'], + 'd' : [1., 2., 3, 4]}) .. ipython:: python - sf[2.1:4.6] - sf.loc[2.1:4.6] + data + indexed1 = data.set_index('c') + indexed1 + indexed2 = data.set_index(['a', 'b']) + indexed2 -In non-float indexes, slicing using floats will raise a ``TypeError`` +The ``append`` keyword option allow you to keep the existing index and append +the given columns to a MultiIndex: -.. code-block:: python +.. ipython:: python - In [1]: Series(range(5))[3.5] - TypeError: the label [3.5] is not a proper indexer for this index type (Int64Index) + frame = data.set_index('c', drop=False) + frame = frame.set_index(['a', 'b'], append=True) + frame - In [1]: Series(range(5))[3.5:4.5] - TypeError: the slice start [3.5] is not a proper indexer for this index type (Int64Index) +Other options in ``set_index`` allow you not drop the index columns or to add +the index in-place (without creating a new object): -Using a scalar float indexer will be deprecated in a future version, but is allowed for now. +.. ipython:: python -.. code-block:: python + data.set_index('c', drop=False) + data.set_index(['a', 'b'], inplace=True) + data - In [3]: Series(range(5))[3.0] - Out[3]: 3 +Reset the index +~~~~~~~~~~~~~~~ -Here is a typical use-case for using this type of indexing. Imagine that you have a somewhat -irregular timedelta-like indexing scheme, but the data is recorded as floats. This could for -example be millisecond offsets. +As a convenience, there is a new function on DataFrame called ``reset_index`` +which transfers the index values into the DataFrame's columns and sets a simple +integer index. This is the inverse operation to ``set_index`` .. ipython:: python - dfir = concat([DataFrame(randn(5,2), - index=np.arange(5) * 250.0, - columns=list('AB')), - DataFrame(randn(6,2), - index=np.arange(4,10) * 250.1, - columns=list('AB'))]) - dfir + data + data.reset_index() + +The output is more similar to a SQL table or a record array. The names for the +columns derived from the index are the ones stored in the ``names`` attribute. -Selection operations then will always work on a value basis, for all selection operators. +You can use the ``level`` keyword to remove only a portion of the index: .. ipython:: python - dfir[0:1000.4] - dfir.loc[0:1001,'A'] - dfir.loc[1000.4] + frame + frame.reset_index(level=1) -You could then easily pick out the first 1 second (1000 ms) of data then. -.. ipython:: python +``reset_index`` takes an optional parameter ``drop`` which if true simply +discards the index, instead of putting index values in the DataFrame's columns. - dfir[0:1000] +.. note:: -Of course if you need integer based selection, then use ``iloc`` + The ``reset_index`` method used to be called ``delevel`` which is now + deprecated. -.. ipython:: python +Adding an ad hoc index +~~~~~~~~~~~~~~~~~~~~~~ + +If you create an index yourself, you can just assign it to the ``index`` field: - dfir.iloc[0:5] +.. code-block:: python + + data.index = index .. 
_indexing.view_versus_copy: @@ -1539,795 +1485,3 @@ This will **not** work at all, and so should be avoided reported. -Fallback indexing ------------------ - -.. _indexing.fallback: - -Float indexes should be used only with caution. If you have a float indexed -``DataFrame`` and try to select using an integer, the row that pandas returns -might not be what you expect. pandas first attempts to use the *integer* -as a *label* location, but fails to find a match (because the types -are not equal). pandas then falls back to back to positional indexing. - -.. ipython:: python - - df = pd.DataFrame(np.random.randn(4,4), - columns=list('ABCD'), index=[1.0, 2.0, 3.0, 4.0]) - df - df.ix[1] - -To select the row you do expect, instead use a float label or -use ``iloc``. - -.. ipython:: python - - df.ix[1.0] - df.iloc[0] - -Instead of using a float index, it is often better to -convert to an integer index: - -.. ipython:: python - - df_new = df.reset_index() - df_new[df_new['index'] == 1.0] - # now you can also do "float selection" - df_new[(df_new['index'] >= 1.0) & (df_new['index'] < 2)] - - -.. _indexing.class: - -Index objects -------------- - -The pandas :class:`~pandas.Index` class and its subclasses can be viewed as -implementing an *ordered multiset*. Duplicates are allowed. However, if you try -to convert an :class:`~pandas.Index` object with duplicate entries into a -``set``, an exception will be raised. - -:class:`~pandas.Index` also provides the infrastructure necessary for -lookups, data alignment, and reindexing. The easiest way to create an -:class:`~pandas.Index` directly is to pass a ``list`` or other sequence to -:class:`~pandas.Index`: - -.. ipython:: python - - index = Index(['e', 'd', 'a', 'b']) - index - 'd' in index - -You can also pass a ``name`` to be stored in the index: - - -.. ipython:: python - - index = Index(['e', 'd', 'a', 'b'], name='something') - index.name - -Starting with pandas 0.5, the name, if set, will be shown in the console -display: - -.. ipython:: python - - index = Index(list(range(5)), name='rows') - columns = Index(['A', 'B', 'C'], name='cols') - df = DataFrame(np.random.randn(5, 3), index=index, columns=columns) - df - df['A'] - - -Set operations on Index objects -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. _indexing.set_ops: - -The three main operations are ``union (|)``, ``intersection (&)``, and ``diff -(-)``. These can be directly called as instance methods or used via overloaded -operators: - -.. ipython:: python - - a = Index(['c', 'b', 'a']) - b = Index(['c', 'e', 'd']) - a.union(b) - a | b - a & b - a - b - -Also available is the ``sym_diff (^)`` operation, which returns elements -that appear in either ``idx1`` or ``idx2`` but not both. This is -equivalent to the Index created by ``(idx1 - idx2) + (idx2 - idx1)``, -with duplicates dropped. - -.. ipython:: python - - idx1 = Index([1, 2, 3, 4]) - idx2 = Index([2, 3, 4, 5]) - idx1.sym_diff(idx2) - idx1 ^ idx2 - -.. _indexing.hierarchical: - -Hierarchical indexing (MultiIndex) ----------------------------------- - -Hierarchical indexing (also referred to as "multi-level" indexing) is brand new -in the pandas 0.4 release. It is very exciting as it opens the door to some -quite sophisticated data analysis and manipulation, especially for working with -higher dimensional data. In essence, it enables you to store and manipulate -data with an arbitrary number of dimensions in lower dimensional data -structures like Series (1d) and DataFrame (2d). 
- -In this section, we will show what exactly we mean by "hierarchical" indexing -and how it integrates with the all of the pandas indexing functionality -described above and in prior sections. Later, when discussing :ref:`group by -` and :ref:`pivoting and reshaping data `, we'll show -non-trivial applications to illustrate how it aids in structuring data for -analysis. - -See the :ref:`cookbook` for some advanced strategies - -Creating a MultiIndex (hierarchical index) object -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The ``MultiIndex`` object is the hierarchical analogue of the standard -``Index`` object which typically stores the axis labels in pandas objects. You -can think of ``MultiIndex`` an array of tuples where each tuple is unique. A -``MultiIndex`` can be created from a list of arrays (using -``MultiIndex.from_arrays``), an array of tuples (using -``MultiIndex.from_tuples``), or a crossed set of iterables (using -``MultiIndex.from_product``). The ``Index`` constructor will attempt to return -a ``MultiIndex`` when it is passed a list of tuples. The following examples -demo different ways to initialize MultiIndexes. - - -.. ipython:: python - - arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] - tuples = list(zip(*arrays)) - tuples - - index = MultiIndex.from_tuples(tuples, names=['first', 'second']) - index - - s = Series(randn(8), index=index) - s - -When you want every pairing of the elements in two iterables, it can be easier -to use the ``MultiIndex.from_product`` function: - -.. ipython:: python - - iterables = [['bar', 'baz', 'foo', 'qux'], ['one', 'two']] - MultiIndex.from_product(iterables, names=['first', 'second']) - -As a convenience, you can pass a list of arrays directly into Series or -DataFrame to construct a MultiIndex automatically: - -.. ipython:: python - - arrays = [np.array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']) - , - np.array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']) - ] - s = Series(randn(8), index=arrays) - s - df = DataFrame(randn(8, 4), index=arrays) - df - -All of the ``MultiIndex`` constructors accept a ``names`` argument which stores -string names for the levels themselves. If no names are provided, ``None`` will -be assigned: - -.. ipython:: python - - df.index.names - -This index can back any axis of a pandas object, and the number of **levels** -of the index is up to you: - -.. ipython:: python - - df = DataFrame(randn(3, 8), index=['A', 'B', 'C'], columns=index) - df - DataFrame(randn(6, 6), index=index[:6], columns=index[:6]) - -We've "sparsified" the higher levels of the indexes to make the console output a -bit easier on the eyes. - -It's worth keeping in mind that there's nothing preventing you from using -tuples as atomic labels on an axis: - -.. ipython:: python - - Series(randn(8), index=tuples) - -The reason that the ``MultiIndex`` matters is that it can allow you to do -grouping, selection, and reshaping operations as we will describe below and in -subsequent areas of the documentation. As you will see in later sections, you -can find yourself working with hierarchically-indexed data without creating a -``MultiIndex`` explicitly yourself. However, when loading data from a file, you -may wish to generate your own ``MultiIndex`` when preparing the data set. - -Note that how the index is displayed by be controlled using the -``multi_sparse`` option in ``pandas.set_printoptions``: - -.. 
ipython:: python - - pd.set_option('display.multi_sparse', False) - df - pd.set_option('display.multi_sparse', True) - -Reconstructing the level labels -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. _indexing.get_level_values: - -The method ``get_level_values`` will return a vector of the labels for each -location at a particular level: - -.. ipython:: python - - index.get_level_values(0) - index.get_level_values('second') - - -Basic indexing on axis with MultiIndex -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -One of the important features of hierarchical indexing is that you can select -data by a "partial" label identifying a subgroup in the data. **Partial** -selection "drops" levels of the hierarchical index in the result in a -completely analogous way to selecting a column in a regular DataFrame: - -.. ipython:: python - - df['bar'] - df['bar', 'one'] - df['bar']['one'] - s['qux'] - -See :ref:`Cross-section with hierarchical index ` for how to select -on a deeper level. - - -Data alignment and using ``reindex`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Operations between differently-indexed objects having ``MultiIndex`` on the -axes will work as you expect; data alignment will work the same as an Index of -tuples: - -.. ipython:: python - - s + s[:-2] - s + s[::2] - -``reindex`` can be called with another ``MultiIndex`` or even a list or array -of tuples: - -.. ipython:: python - - s.reindex(index[:3]) - s.reindex([('foo', 'two'), ('bar', 'one'), ('qux', 'one'), ('baz', 'one')]) - -.. _indexing.advanced_hierarchical: - -Advanced indexing with hierarchical index -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Syntactically integrating ``MultiIndex`` in advanced indexing with ``.loc/.ix`` is a -bit challenging, but we've made every effort to do so. for example the -following works as you would expect: - -.. ipython:: python - - df = df.T - df - df.loc['bar'] - df.loc['bar', 'two'] - -"Partial" slicing also works quite nicely. - -.. ipython:: python - - df.loc['baz':'foo'] - -You can slice with a 'range' of values, by providing a slice of tuples. - -.. ipython:: python - - df.loc[('baz', 'two'):('qux', 'one')] - df.loc[('baz', 'two'):'foo'] - -Passing a list of labels or tuples works similar to reindexing: - -.. ipython:: python - - df.ix[[('bar', 'two'), ('qux', 'one')]] - -.. _indexing.mi_slicers: - -Multiindexing using slicers -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. versionadded:: 0.14.0 - -In 0.14.0 we added a new way to slice multi-indexed objects. -You can slice a multi-index by providing multiple indexers. - -You can provide any of the selectors as if you are indexing by label, see :ref:`Selection by Label `, -including slices, lists of labels, labels, and boolean indexers. - -You can use ``slice(None)`` to select all the contents of *that* level. You do not need to specify all the -*deeper* levels, they will be implied as ``slice(None)``. - -As usual, **both sides** of the slicers are included as this is label indexing. - -.. warning:: - - You should specify all axes in the ``.loc`` specifier, meaning the indexer for the **index** and - for the **columns**. Their are some ambiguous cases where the passed indexer could be mis-interpreted - as indexing *both* axes, rather than into say the MuliIndex for the rows. - - You should do this: - - .. code-block:: python - - df.loc[(slice('A1','A3'),.....),:] - - rather than this: - - .. code-block:: python - - df.loc[(slice('A1','A3'),.....)] - -.. warning:: - - You will need to make sure that the selection axes are fully lexsorted! - -.. 
ipython:: python - - def mklbl(prefix,n): - return ["%s%s" % (prefix,i) for i in range(n)] - - miindex = MultiIndex.from_product([mklbl('A',4), - mklbl('B',2), - mklbl('C',4), - mklbl('D',2)]) - micolumns = MultiIndex.from_tuples([('a','foo'),('a','bar'), - ('b','foo'),('b','bah')], - names=['lvl0', 'lvl1']) - dfmi = DataFrame(np.arange(len(miindex)*len(micolumns)).reshape((len(miindex),len(micolumns))), - index=miindex, - columns=micolumns).sortlevel().sortlevel(axis=1) - dfmi - -Basic multi-index slicing using slices, lists, and labels. - -.. ipython:: python - - dfmi.loc[(slice('A1','A3'),slice(None), ['C1','C3']),:] - -You can use a ``pd.IndexSlice`` to shortcut the creation of these slices - -.. ipython:: python - - idx = pd.IndexSlice - dfmi.loc[idx[:,:,['C1','C3']],idx[:,'foo']] - -It is possible to perform quite complicated selections using this method on multiple -axes at the same time. - -.. ipython:: python - - dfmi.loc['A1',(slice(None),'foo')] - dfmi.loc[idx[:,:,['C1','C3']],idx[:,'foo']] - -Using a boolean indexer you can provide selection related to the *values*. - -.. ipython:: python - - mask = dfmi[('a','foo')]>200 - dfmi.loc[idx[mask,:,['C1','C3']],idx[:,'foo']] - -You can also specify the ``axis`` argument to ``.loc`` to interpret the passed -slicers on a single axis. - -.. ipython:: python - - dfmi.loc(axis=0)[:,:,['C1','C3']] - -Furthermore you can *set* the values using these methods - -.. ipython:: python - - df2 = dfmi.copy() - df2.loc(axis=0)[:,:,['C1','C3']] = -10 - df2 - -You can use a right-hand-side of an alignable object as well. - -.. ipython:: python - - df2 = dfmi.copy() - df2.loc[idx[:,:,['C1','C3']],:] = df2*1000 - df2 - -.. _indexing.xs: - -Cross-section with hierarchical index -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The ``xs`` method of ``DataFrame`` additionally takes a level argument to make -selecting data at a particular level of a MultiIndex easier. - -.. ipython:: python - - df.xs('one', level='second') - -.. ipython:: python - - # using the slicers (new in 0.14.0) - df.loc[(slice(None),'one'),:] - -You can also select on the columns with :meth:`~pandas.MultiIndex.xs`, by -providing the axis argument - -.. ipython:: python - - df = df.T - df.xs('one', level='second', axis=1) - -.. ipython:: python - - # using the slicers (new in 0.14.0) - df.loc[:,(slice(None),'one')] - -:meth:`~pandas.MultiIndex.xs` also allows selection with multiple keys - -.. ipython:: python - - df.xs(('one', 'bar'), level=('second', 'first'), axis=1) - -.. ipython:: python - - # using the slicers (new in 0.14.0) - df.loc[:,('bar','one')] - -.. versionadded:: 0.13.0 - -You can pass ``drop_level=False`` to :meth:`~pandas.MultiIndex.xs` to retain -the level that was selected - -.. ipython:: python - - df.xs('one', level='second', axis=1, drop_level=False) - -versus the result with ``drop_level=True`` (the default value) - -.. ipython:: python - - df.xs('one', level='second', axis=1, drop_level=True) - -.. ipython:: python - :suppress: - - df = df.T - -.. _indexing.advanced_reindex: - -Advanced reindexing and alignment with hierarchical index -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The parameter ``level`` has been added to the ``reindex`` and ``align`` methods -of pandas objects. This is useful to broadcast values across a level. For -instance: - -.. 
ipython:: python - - midx = MultiIndex(levels=[['zero', 'one'], ['x','y']], - labels=[[1,1,0,0],[1,0,1,0]]) - df = DataFrame(randn(4,2), index=midx) - print(df) - df2 = df.mean(level=0) - print(df2) - print(df2.reindex(df.index, level=0)) - df_aligned, df2_aligned = df.align(df2, level=0) - print(df_aligned) - print(df2_aligned) - - -The need for sortedness with :class:`~pandas.MultiIndex` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -**Caveat emptor**: the present implementation of ``MultiIndex`` requires that -the labels be sorted for some of the slicing / indexing routines to work -correctly. You can think about breaking the axis into unique groups, where at -the hierarchical level of interest, each distinct group shares a label, but no -two have the same label. However, the ``MultiIndex`` does not enforce this: -**you are responsible for ensuring that things are properly sorted**. There is -an important new method ``sortlevel`` to sort an axis within a ``MultiIndex`` -so that its labels are grouped and sorted by the original ordering of the -associated factor at that level. Note that this does not necessarily mean the -labels will be sorted lexicographically! - -.. ipython:: python - - import random; random.shuffle(tuples) - s = Series(randn(8), index=MultiIndex.from_tuples(tuples)) - s - s.sortlevel(0) - s.sortlevel(1) - -.. _indexing.sortlevel_byname: - -Note, you may also pass a level name to ``sortlevel`` if the MultiIndex levels -are named. - -.. ipython:: python - - s.index.set_names(['L1', 'L2'], inplace=True) - s.sortlevel(level='L1') - s.sortlevel(level='L2') - -Some indexing will work even if the data are not sorted, but will be rather -inefficient and will also return a copy of the data rather than a view: - -.. ipython:: python - - s['qux'] - s.sortlevel(1)['qux'] - -On higher dimensional objects, you can sort any of the other axes by level if -they have a MultiIndex: - -.. ipython:: python - - df.T.sortlevel(1, axis=1) - -The ``MultiIndex`` object has code to **explicity check the sort depth**. Thus, -if you try to index at a depth at which the index is not sorted, it will raise -an exception. Here is a concrete example to illustrate this: - -.. ipython:: python - - tuples = [('a', 'a'), ('a', 'b'), ('b', 'a'), ('b', 'b')] - idx = MultiIndex.from_tuples(tuples) - idx.lexsort_depth - - reordered = idx[[1, 0, 3, 2]] - reordered.lexsort_depth - - s = Series(randn(4), index=reordered) - s.ix['a':'a'] - -However: - -:: - - >>> s.ix[('a', 'b'):('b', 'a')] - Traceback (most recent call last) - ... - KeyError: Key length (3) was greater than MultiIndex lexsort depth (2) - -Swapping levels with :meth:`~pandas.MultiIndex.swaplevel` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The ``swaplevel`` function can switch the order of two levels: - -.. ipython:: python - - df[:5] - df[:5].swaplevel(0, 1, axis=0) - -.. _indexing.reorderlevels: - -Reordering levels with :meth:`~pandas.MultiIndex.reorder_levels` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The ``reorder_levels`` function generalizes the ``swaplevel`` function, -allowing you to permute the hierarchical index levels in one step: - -.. ipython:: python - - df[:5].reorder_levels([1,0], axis=0) - - -Some gory internal details -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Internally, the ``MultiIndex`` consists of a few things: the **levels**, the -integer **labels**, and the level **names**: - -.. 
ipython:: python - - index - index.levels - index.labels - index.names - -You can probably guess that the labels determine which unique element is -identified with that location at each layer of the index. It's important to -note that sortedness is determined **solely** from the integer labels and does -not check (or care) whether the levels themselves are sorted. Fortunately, the -constructors ``from_tuples`` and ``from_arrays`` ensure that this is true, but -if you compute the levels and labels yourself, please be careful. - - -Setting index metadata (``name(s)``, ``levels``, ``labels``) ------------------------------------------------------------- - -.. versionadded:: 0.13.0 - -.. _indexing.set_metadata: - -Indexes are "mostly immutable", but it is possible to set and change their -metadata, like the index ``name`` (or, for ``MultiIndex``, ``levels`` and -``labels``). - -You can use the ``rename``, ``set_names``, ``set_levels``, and ``set_labels`` -to set these attributes directly. They default to returning a copy; however, -you can specify ``inplace=True`` to have the data change in place. - -.. ipython:: python - - ind = Index([1, 2, 3]) - ind.rename("apple") - ind - ind.set_names(["apple"], inplace=True) - ind.name = "bob" - ind - -.. versionadded:: 0.15.0 - -``set_names``, ``set_levels``, and ``set_labels`` also take an optional -`level`` argument - -.. ipython:: python - - index - index.levels[1] - index.set_levels(["a", "b"], level=1) - -Adding an index to an existing DataFrame ----------------------------------------- - -Occasionally you will load or create a data set into a DataFrame and want to -add an index after you've already done so. There are a couple of different -ways. - -Add an index using DataFrame columns -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. _indexing.set_index: - -DataFrame has a ``set_index`` method which takes a column name (for a regular -``Index``) or a list of column names (for a ``MultiIndex``), to create a new, -indexed DataFrame: - -.. ipython:: python - :suppress: - - data = DataFrame({'a' : ['bar', 'bar', 'foo', 'foo'], - 'b' : ['one', 'two', 'one', 'two'], - 'c' : ['z', 'y', 'x', 'w'], - 'd' : [1., 2., 3, 4]}) - -.. ipython:: python - - data - indexed1 = data.set_index('c') - indexed1 - indexed2 = data.set_index(['a', 'b']) - indexed2 - -The ``append`` keyword option allow you to keep the existing index and append -the given columns to a MultiIndex: - -.. ipython:: python - - frame = data.set_index('c', drop=False) - frame = frame.set_index(['a', 'b'], append=True) - frame - -Other options in ``set_index`` allow you not drop the index columns or to add -the index in-place (without creating a new object): - -.. ipython:: python - - data.set_index('c', drop=False) - data.set_index(['a', 'b'], inplace=True) - data - -Remove / reset the index, ``reset_index`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -As a convenience, there is a new function on DataFrame called ``reset_index`` -which transfers the index values into the DataFrame's columns and sets a simple -integer index. This is the inverse operation to ``set_index`` - -.. ipython:: python - - data - data.reset_index() - -The output is more similar to a SQL table or a record array. The names for the -columns derived from the index are the ones stored in the ``names`` attribute. - -You can use the ``level`` keyword to remove only a portion of the index: - -.. 
ipython:: python - - frame - frame.reset_index(level=1) - - -``reset_index`` takes an optional parameter ``drop`` which if true simply -discards the index, instead of putting index values in the DataFrame's columns. - -.. note:: - - The ``reset_index`` method used to be called ``delevel`` which is now - deprecated. - -Adding an ad hoc index -~~~~~~~~~~~~~~~~~~~~~~ - -If you create an index yourself, you can just assign it to the ``index`` field: - -.. code-block:: python - - data.index = index - -Indexing internal details -------------------------- - -.. note:: - - The following is largely relevant for those actually working on the pandas - codebase. The source code is still the best place to look at the specifics - of how things are implemented. - -In pandas there are a few objects implemented which can serve as valid -containers for the axis labels: - - - ``Index``: the generic "ordered set" object, an ndarray of object dtype - assuming nothing about its contents. The labels must be hashable (and - likely immutable) and unique. Populates a dict of label to location in - Cython to do :math:`O(1)` lookups. - - ``Int64Index``: a version of ``Index`` highly optimized for 64-bit integer - data, such as time stamps - - ``MultiIndex``: the standard hierarchical index object - - ``PeriodIndex``: An Index object with Period elements - - ``DatetimeIndex``: An Index object with Timestamp elements - - ``date_range``: fixed frequency date range generated from a time rule or - DateOffset. An ndarray of Python datetime objects - -The motivation for having an ``Index`` class in the first place was to enable -different implementations of indexing. This means that it's possible for you, -the user, to implement a custom ``Index`` subclass that may be better suited to -a particular application than the ones provided in pandas. - -From an internal implementation point of view, the relevant methods that an -``Index`` must define are one or more of the following (depending on how -incompatible the new object internals are with the ``Index`` functions): - - - ``get_loc``: returns an "indexer" (an integer, or in some cases a - slice object) for a label - - ``slice_locs``: returns the "range" to slice between two labels - - ``get_indexer``: Computes the indexing vector for reindexing / data - alignment purposes. See the source / docstrings for more on this - - ``get_indexer_non_unique``: Computes the indexing vector for reindexing / data - alignment purposes when the index is non-unique. See the source / docstrings - for more on this - - ``reindex``: Does any pre-conversion of the input index then calls - ``get_indexer`` - - ``union``, ``intersection``: computes the union or intersection of two - Index objects - - ``insert``: Inserts a new label into an Index, yielding a new object - - ``delete``: Delete a label, yielding a new object - - ``drop``: Deletes a set of labels - - ``take``: Analogous to ndarray.take diff --git a/doc/source/install.rst b/doc/source/install.rst index fb22a86096b59..0331e8a47903c 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -247,22 +247,22 @@ installed), make sure you have `nose Dependencies ------------ - * `NumPy `__: 1.7.0 or higher - * `python-dateutil `__ 1.5 - * `pytz `__ - * Needed for time zone support +* `NumPy `__: 1.7.0 or higher +* `python-dateutil `__ 1.5 +* `pytz `__ + * Needed for time zone support .. _install.recommended_dependencies: Recommended Dependencies ~~~~~~~~~~~~~~~~~~~~~~~~ - * `numexpr `__: for accelerating certain numerical operations. 
- ``numexpr`` uses multiple cores as well as smart chunking and caching to achieve large speedups. - If installed, must be Version 2.1 or higher. +* `numexpr `__: for accelerating certain numerical operations. + ``numexpr`` uses multiple cores as well as smart chunking and caching to achieve large speedups. + If installed, must be Version 2.1 or higher. - * `bottleneck `__: for accelerating certain types of ``nan`` - evaluations. ``bottleneck`` uses specialized cython routines to achieve large speedups. +* `bottleneck `__: for accelerating certain types of ``nan`` + evaluations. ``bottleneck`` uses specialized cython routines to achieve large speedups. .. note:: @@ -275,69 +275,72 @@ Recommended Dependencies Optional Dependencies ~~~~~~~~~~~~~~~~~~~~~ - * `Cython `__: Only necessary to build development - version. Version 0.17.1 or higher. - * `SciPy `__: miscellaneous statistical functions - * `PyTables `__: necessary for HDF5-based storage. Version 3.0.0 or higher required. - * `SQLAlchemy `__: for SQL database support. Version 0.8.1 or higher recommended. - * `matplotlib `__: for plotting - * `statsmodels `__ - * Needed for parts of :mod:`pandas.stats` - * `openpyxl `__, `xlrd/xlwt `__ - * openpyxl version 1.6.1 or higher, but lower than 2.0.0 - * Needed for Excel I/O - * `XlsxWriter `__ - * Alternative Excel writer. - * `boto `__: necessary for Amazon S3 - access. - * One of `PyQt4 - `__, `PySide - `__, `pygtk - `__, `xsel - `__, or `xclip - `__: necessary to use - :func:`~pandas.io.clipboard.read_clipboard`. Most package managers on Linux - distributions will have xclip and/or xsel immediately available for - installation. - * Google's `python-gflags` and `google-api-python-client` - * Needed for :mod:`~pandas.io.gbq` - * `httplib2` - * Needed for :mod:`~pandas.io.gbq` - * One of the following combinations of libraries is needed to use the - top-level :func:`~pandas.io.html.read_html` function: - - * `BeautifulSoup4`_ and `html5lib`_ (Any recent version of `html5lib`_ is - okay.) - * `BeautifulSoup4`_ and `lxml`_ - * `BeautifulSoup4`_ and `html5lib`_ and `lxml`_ - * Only `lxml`_, although see :ref:`HTML reading gotchas ` - for reasons as to why you should probably **not** take this approach. - - .. warning:: - - * if you install `BeautifulSoup4`_ you must install either - `lxml`_ or `html5lib`_ or both. - :func:`~pandas.io.html.read_html` will **not** work with *only* - `BeautifulSoup4`_ installed. - * You are highly encouraged to read :ref:`HTML reading gotchas - `. It explains issues surrounding the installation and - usage of the above three libraries - * You may need to install an older version of `BeautifulSoup4`_: - - Versions 4.2.1, 4.1.3 and 4.0.2 have been confirmed for 64 and - 32-bit Ubuntu/Debian - * Additionally, if you're using `Anaconda`_ you should definitely - read :ref:`the gotchas about HTML parsing libraries ` - - .. note:: - - * if you're on a system with ``apt-get`` you can do - - .. code-block:: sh - - sudo apt-get build-dep python-lxml - - to get the necessary dependencies for installation of `lxml`_. This - will prevent further headaches down the line. +* `Cython `__: Only necessary to build development + version. Version 0.19.1 or higher. +* `SciPy `__: miscellaneous statistical functions +* `PyTables `__: necessary for HDF5-based storage. Version 3.0.0 or higher required. +* `SQLAlchemy `__: for SQL database support. Version 0.8.1 or higher recommended. 
+* `matplotlib `__: for plotting +* `statsmodels `__ + * Needed for parts of :mod:`pandas.stats` +* `openpyxl `__, `xlrd/xlwt `__ + * openpyxl version 1.6.1 or higher, but lower than 2.0.0 + * Needed for Excel I/O +* `XlsxWriter `__ + * Alternative Excel writer. +* `boto `__: necessary for Amazon S3 + access. +* One of `PyQt4 + `__, `PySide + `__, `pygtk + `__, `xsel + `__, or `xclip + `__: necessary to use + :func:`~pandas.io.clipboard.read_clipboard`. Most package managers on Linux + distributions will have xclip and/or xsel immediately available for + installation. +* Google's `python-gflags `__ + and `google-api-python-client `__ + * Needed for :mod:`~pandas.io.gbq` +* `setuptools `__ + * Needed for :mod:`~pandas.io.gbq` (specifically, it utilizes `pkg_resources`) +* `httplib2 `__ + * Needed for :mod:`~pandas.io.gbq` +* One of the following combinations of libraries is needed to use the + top-level :func:`~pandas.io.html.read_html` function: + + * `BeautifulSoup4`_ and `html5lib`_ (Any recent version of `html5lib`_ is + okay.) + * `BeautifulSoup4`_ and `lxml`_ + * `BeautifulSoup4`_ and `html5lib`_ and `lxml`_ + * Only `lxml`_, although see :ref:`HTML reading gotchas ` + for reasons as to why you should probably **not** take this approach. + + .. warning:: + + * if you install `BeautifulSoup4`_ you must install either + `lxml`_ or `html5lib`_ or both. + :func:`~pandas.io.html.read_html` will **not** work with *only* + `BeautifulSoup4`_ installed. + * You are highly encouraged to read :ref:`HTML reading gotchas + `. It explains issues surrounding the installation and + usage of the above three libraries + * You may need to install an older version of `BeautifulSoup4`_: + - Versions 4.2.1, 4.1.3 and 4.0.2 have been confirmed for 64 and + 32-bit Ubuntu/Debian + * Additionally, if you're using `Anaconda`_ you should definitely + read :ref:`the gotchas about HTML parsing libraries ` + + .. note:: + + * if you're on a system with ``apt-get`` you can do + + .. code-block:: sh + + sudo apt-get build-dep python-lxml + + to get the necessary dependencies for installation of `lxml`_. This + will prevent further headaches down the line. .. _html5lib: https://github.com/html5lib/html5lib-python diff --git a/doc/source/internals.rst b/doc/source/internals.rst new file mode 100644 index 0000000000000..9418ca5265f1a --- /dev/null +++ b/doc/source/internals.rst @@ -0,0 +1,97 @@ +.. _internals: + +.. currentmodule:: pandas + +.. ipython:: python + :suppress: + + import numpy as np + import random + np.random.seed(123456) + from pandas import * + options.display.max_rows=15 + import pandas as pd + randn = np.random.randn + randint = np.random.randint + np.set_printoptions(precision=4, suppress=True) + from pandas.compat import range, zip + +********* +Internals +********* + +This section will provide a look into some of pandas internals. + +Indexing +-------- + +In pandas there are a few objects implemented which can serve as valid +containers for the axis labels: + +- ``Index``: the generic "ordered set" object, an ndarray of object dtype + assuming nothing about its contents. The labels must be hashable (and + likely immutable) and unique. Populates a dict of label to location in + Cython to do ``O(1)`` lookups. 
+- ``Int64Index``: a version of ``Index`` highly optimized for 64-bit integer + data, such as time stamps +- ``Float64Index``: a version of ``Index`` highly optimized for 64-bit float data +- ``MultiIndex``: the standard hierarchical index object +- ``DatetimeIndex``: An Index object with ``Timestamp`` boxed elements (impl are the int64 values) +- ``TimedeltaIndex``: An Index object with ``Timedelta`` boxed elements (impl are the in64 values) +- ``PeriodIndex``: An Index object with Period elements + +These are range generates to make the creation of a regular index easy: + +- ``date_range``: fixed frequency date range generated from a time rule or + DateOffset. An ndarray of Python datetime objects +- ``period_range``: fixed frequency date range generated from a time rule or + DateOffset. An ndarray of ``Period`` objects, representing Timespans + +The motivation for having an ``Index`` class in the first place was to enable +different implementations of indexing. This means that it's possible for you, +the user, to implement a custom ``Index`` subclass that may be better suited to +a particular application than the ones provided in pandas. + +From an internal implementation point of view, the relevant methods that an +``Index`` must define are one or more of the following (depending on how +incompatible the new object internals are with the ``Index`` functions): + +- ``get_loc``: returns an "indexer" (an integer, or in some cases a + slice object) for a label +- ``slice_locs``: returns the "range" to slice between two labels +- ``get_indexer``: Computes the indexing vector for reindexing / data + alignment purposes. See the source / docstrings for more on this +- ``get_indexer_non_unique``: Computes the indexing vector for reindexing / data + alignment purposes when the index is non-unique. See the source / docstrings + for more on this +- ``reindex``: Does any pre-conversion of the input index then calls + ``get_indexer`` +- ``union``, ``intersection``: computes the union or intersection of two + Index objects +- ``insert``: Inserts a new label into an Index, yielding a new object +- ``delete``: Delete a label, yielding a new object +- ``drop``: Deletes a set of labels +- ``take``: Analogous to ndarray.take + +MultiIndex +~~~~~~~~~~ + +Internally, the ``MultiIndex`` consists of a few things: the **levels**, the +integer **labels**, and the level **names**: + +.. ipython:: python + + index = MultiIndex.from_product([range(3), ['one', 'two']], names=['first', 'second']) + index + index.levels + index.labels + index.names + +You can probably guess that the labels determine which unique element is +identified with that location at each layer of the index. It's important to +note that sortedness is determined **solely** from the integer labels and does +not check (or care) whether the levels themselves are sorted. Fortunately, the +constructors ``from_tuples`` and ``from_arrays`` ensure that this is true, but +if you compute the levels and labels yourself, please be careful. + + diff --git a/doc/source/io.rst b/doc/source/io.rst index baf684056e169..e0c6c79380bea 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -100,8 +100,10 @@ They can take a number of arguments: a list of integers that specify row locations for a multi-index on the columns E.g. [0,1,3]. Intervening rows that are not specified will be skipped (e.g. 2 in this example are skipped). Note that this parameter - ignores commented lines, so header=0 denotes the first line of - data rather than the first line of the file. 
+ ignores commented lines and empty lines if ``skip_blank_lines=True`` (the default), + so header=0 denotes the first line of data rather than the first line of the file. + - ``skip_blank_lines``: whether to skip over blank lines rather than interpreting + them as NaN values - ``skiprows``: A collection of numbers for rows in the file to skip. Can also be an integer to skip the first ``n`` rows - ``index_col``: column number, column name, or list of column numbers/names, @@ -149,7 +151,7 @@ They can take a number of arguments: - ``escapechar`` : string, to specify how to escape quoted data - ``comment``: Indicates remainder of line should not be parsed. If found at the beginning of a line, the line will be ignored altogether. This parameter - must be a single character. Also, fully commented lines + must be a single character. Like empty lines, fully commented lines are ignored by the parameter `header` but not by `skiprows`. For example, if comment='#', parsing '#empty\n1,2,3\na,b,c' with `header=0` will result in '1,2,3' being treated as the header. @@ -165,7 +167,9 @@ They can take a number of arguments: - ``converters``: a dictionary of functions for converting values in certain columns, where keys are either integers or column labels - ``encoding``: a string representing the encoding to use for decoding - unicode data, e.g. ``'utf-8``` or ``'latin-1'``. + unicode data, e.g. ``'utf-8``` or ``'latin-1'``. `Full list of Python + standard encodings + `_ - ``verbose``: show number of NA values inserted in non-numeric columns - ``squeeze``: if True then output with only one column is turned into Series - ``error_bad_lines``: if False then any lines causing an error will be skipped :ref:`bad lines ` @@ -174,7 +178,12 @@ They can take a number of arguments: - ``mangle_dupe_cols``: boolean, default True, then duplicate columns will be specified as 'X.0'...'X.N', rather than 'X'...'X' - ``tupleize_cols``: boolean, default False, if False, convert a list of tuples - to a multi-index of columns, otherwise, leave the column index as a list of tuples + to a multi-index of columns, otherwise, leave the column index as a list of + tuples + - ``float_precision`` : string, default None. Specifies which converter the C + engine should use for floating-point values. The options are None for the + ordinary converter, 'high' for the high-precision converter, and + 'round_trip' for the round-trip converter. .. ipython:: python :suppress: @@ -259,27 +268,6 @@ after a delimiter: print(data) pd.read_csv(StringIO(data), skipinitialspace=True) -Moreover, ``read_csv`` ignores any completely commented lines: - -.. ipython:: python - - data = 'a,b,c\n# commented line\n1,2,3\n#another comment\n4,5,6' - print(data) - pd.read_csv(StringIO(data), comment='#') - -.. note:: - - The presence of ignored lines might create ambiguities involving line numbers; - the parameter ``header`` uses row numbers (ignoring commented - lines), while ``skiprows`` uses line numbers (including commented lines): - - .. ipython:: python - - data = '#comment\na,b,c\nA,B,C\n1,2,3' - pd.read_csv(StringIO(data), comment='#', header=1) - data = 'A,B,C\n#comment\na,b,c\n1,2,3' - pd.read_csv(StringIO(data), comment='#', skiprows=2) - The parsers make every attempt to "do the right thing" and not be very fragile. Type inference is a pretty big deal. So if a column can be coerced to integer dtype without altering the contents, it will do so. 
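As a quick, hypothetical illustration of this inference (the column names and values are made up for the example; ``StringIO`` is assumed to be importable as in the surrounding examples):

.. code-block:: python

    from io import StringIO  # assumption: Python 3 standard library StringIO
    import pandas as pd

    data = 'a,b\n1,x\n2,y\n3,z'
    df = pd.read_csv(StringIO(data))
    df.dtypes   # column 'a' is coerced to int64, while 'b' stays object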
Any non-numeric @@ -356,6 +344,50 @@ file, either using the column names or position numbers: pd.read_csv(StringIO(data), usecols=['b', 'd']) pd.read_csv(StringIO(data), usecols=[0, 2, 3]) +.. _io.skiplines: + +Ignoring line comments and empty lines +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +If the ``comment`` parameter is specified, then completely commented lines will +be ignored. By default, completely blank lines will be ignored as well. Both of +these are API changes introduced in version 0.15. + +.. ipython:: python + + data = '\na,b,c\n \n# commented line\n1,2,3\n\n4,5,6' + print(data) + pd.read_csv(StringIO(data), comment='#') + +If ``skip_blank_lines=False``, then ``read_csv`` will not ignore blank lines: + +.. ipython:: python + + data = 'a,b,c\n\n1,2,3\n\n\n4,5,6' + pd.read_csv(StringIO(data), skip_blank_lines=False) + +.. warning:: + + The presence of ignored lines might create ambiguities involving line numbers; + the parameter ``header`` uses row numbers (ignoring commented/empty + lines), while ``skiprows`` uses line numbers (including commented/empty lines): + + .. ipython:: python + + data = '#comment\na,b,c\nA,B,C\n1,2,3' + pd.read_csv(StringIO(data), comment='#', header=1) + data = 'A,B,C\n#comment\na,b,c\n1,2,3' + pd.read_csv(StringIO(data), comment='#', skiprows=2) + + If both ``header`` and ``skiprows`` are specified, ``header`` will be + relative to the end of ``skiprows``. For example: + + .. ipython:: python + + data = '# empty\n# second empty line\n# third empty' \ + 'line\nX,Y,Z\n1,2,3\nA,B,C\n1,2.,4.\n5.,NaN,10.0' + print(data) + pd.read_csv(StringIO(data), comment='#', skiprows=4, header=1) + .. _io.unicode: Dealing with Unicode Data @@ -372,7 +404,9 @@ result in byte strings being decoded to unicode in the result: df['word'][1] Some formats which encode all characters as multiple bytes, like UTF-16, won't -parse correctly at all without specifying the encoding. +parse correctly at all without specifying the encoding. `Full list of Python +standard encodings +`_ .. _io.index_col: @@ -508,6 +542,25 @@ data columns: specify `index_col` as a column label rather then as an index on the resulting frame. +.. _io.float_precision: + +Specifying method for floating-point conversion +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The parameter ``float_precision`` can be specified in order to use +a specific floating-point converter during parsing with the C engine. +The options are the ordinary converter, the high-precision converter, and +the round-trip converter (which is guaranteed to round-trip values after +writing to a file). For example: + +.. ipython:: python + + val = '0.3066101993807095471566981359501369297504425048828125' + data = 'a,b,c\n1,2,{0}'.format(val) + abs(pd.read_csv(StringIO(data), engine='c', float_precision=None)['c'][0] - float(val)) + abs(pd.read_csv(StringIO(data), engine='c', float_precision='high')['c'][0] - float(val)) + abs(pd.read_csv(StringIO(data), engine='c', float_precision='round_trip')['c'][0] - float(val)) + + Date Parsing Functions ~~~~~~~~~~~~~~~~~~~~~~ Finally, the parser allows you can specify a custom ``date_parser`` function to @@ -2005,7 +2058,12 @@ files if `Xlsxwriter`_ is not available. .. _xlwt: http://www.python-excel.org To specify which writer you want to use, you can pass an engine keyword -argument to ``to_excel`` and to ``ExcelWriter``. +argument to ``to_excel`` and to ``ExcelWriter``. 
The built-in engines are: + +- ``openpyxl``: This includes stable support for OpenPyxl 1.6.1 up to but + not including 2.0.0, and experimental support for OpenPyxl 2.0.0 and later. +- ``xlsxwriter`` +- ``xlwt`` .. code-block:: python @@ -2541,17 +2599,17 @@ The right-hand side of the sub-expression (after a comparison operator) can be: string = "HolyMoly'" store.select('df', 'index == %s' % string) - The latter will **not** work and will raise a ``SyntaxError``.Note that - there's a single quote followed by a double quote in the ``string`` - variable. + The latter will **not** work and will raise a ``SyntaxError``.Note that + there's a single quote followed by a double quote in the ``string`` + variable. - If you *must* interpolate, use the ``'%r'`` format specifier + If you *must* interpolate, use the ``'%r'`` format specifier - .. code-block:: python + .. code-block:: python - store.select('df', 'index == %r' % string) + store.select('df', 'index == %r' % string) - which will quote ``string``. + which will quote ``string``. Here are some examples: @@ -3267,6 +3325,12 @@ the database using :func:`~pandas.DataFrame.to_sql`. data.to_sql('data', engine) +With some databases, writing large DataFrames can result in errors due to packet size limitations being exceeded. This can be avoided by setting the ``chunksize`` parameter when calling ``to_sql``. For example, the following writes ``data`` to the database in batches of 1000 rows at a time: + +.. ipython:: python + + data.to_sql('data_chunked', engine, chunksize=1000) + .. note:: Due to the limited support for timedelta's in the different database @@ -3314,6 +3378,20 @@ to pass to :func:`pandas.to_datetime`: You can check if a table exists using :func:`~pandas.io.sql.has_table` +Schema support +~~~~~~~~~~~~~~ + +.. versionadded:: 0.15.0 + +Reading from and writing to different schema's is supported through the ``schema`` +keyword in the :func:`~pandas.read_sql_table` and :func:`~pandas.DataFrame.to_sql` +functions. Note however that this depends on the database flavor (sqlite does not +have schema's). For example: + +.. code-block:: python + + df.to_sql('table', engine, schema='other_schema') + pd.read_sql_table('table', engine, schema='other_schema') Querying ~~~~~~~~ @@ -3333,6 +3411,18 @@ Of course, you can specify a more "complex" query. pd.read_sql_query("SELECT id, Col_1, Col_2 FROM data WHERE id = 42;", engine) +The :func:`~pandas.read_sql_query` function supports a ``chunksize`` argument. +Specifying this will return an iterator through chunks of the query result: + +.. ipython:: python + + df = pd.DataFrame(np.random.randn(20, 3), columns=list('abc')) + df.to_sql('data_chunks', engine, index=False) + +.. ipython:: python + + for chunk in pd.read_sql_query("SELECT * FROM data_chunks", engine, chunksize=5): + print(chunk) You can also run a plain query without creating a dataframe with :func:`~pandas.io.sql.execute`. This is useful for queries that don't return values, @@ -3558,9 +3648,21 @@ read and used to create a ``Categorical`` variable from them. Value labels can also be retrieved by the function ``variable_labels``, which requires data to be called before (see ``pandas.io.stata.StataReader``). +The parameter ``convert_missing`` indicates whether missing value +representations in Stata should be preserved. If ``False`` (the default), +missing values are represented as ``np.nan``. 
If ``True``, missing values are +represented using ``StataMissingValue`` objects, and columns containing missing +values will have ``dtype`` set to ``object``. + The StataReader supports .dta Formats 104, 105, 108, 113-115 and 117. Alternatively, the function :func:`~pandas.io.stata.read_stata` can be used +.. note:: + + Setting ``preserve_dtypes=False`` will upcast all integer data types to + ``int64`` and all floating point data types to ``float64``. By default, + the Stata data types are preserved when importing. + .. ipython:: python :suppress: diff --git a/doc/source/merging.rst b/doc/source/merging.rst index 55bbf613b33cf..7128e2dd82d6c 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -90,7 +90,7 @@ this using the ``keys`` argument: concatenated As you can see (if you've read the rest of the documentation), the resulting -object's index has a :ref:`hierarchical index `. This +object's index has a :ref:`hierarchical index `. This means that we can now do stuff like select out each chunk by key: .. ipython:: python @@ -100,6 +100,18 @@ means that we can now do stuff like select out each chunk by key: It's not a stretch to see how this can be very useful. More detail on this functionality below. +.. note:: + It is worth noting however, that ``concat`` (and therefore ``append``) makes + a full copy of the data, and that constantly reusing this function can + create a signifcant performance hit. If you need to use the operation over + several datasets, use a list comprehension. + +:: + + frames = [ process_your_file(f) for f in files ] + result = pd.concat(frames) + + Set logic on the other axes ~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -118,9 +130,9 @@ behavior: .. ipython:: python - from pandas.util.testing import rands + from pandas.util.testing import rands_array df = DataFrame(np.random.randn(10, 4), columns=['a', 'b', 'c', 'd'], - index=[rands(5) for _ in range(10)]) + index=rands_array(5, 10)) df concat([df.ix[:7, ['a', 'b']], df.ix[2:-2, ['c']], diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst index d3024daaa59c9..4505d256d31f6 100644 --- a/doc/source/missing_data.rst +++ b/doc/source/missing_data.rst @@ -105,6 +105,34 @@ pandas objects provide intercompatibility between ``NaT`` and ``NaN``. df2 df2.get_dtype_counts() +.. _missing.inserting: + +Inserting missing data +---------------------- + +You can insert missing values by simply assigning to containers. The +actual missing value used will be chosen based on the dtype. + +For example, numeric containers will always use ``NaN`` regardless of +the missing value type chosen: + +.. ipython:: python + + s = Series([1, 2, 3]) + s.loc[0] = None + s + +Likewise, datetime containers will always use ``NaT``. + +For object containers, pandas will use the value given: + +.. ipython:: python + + s = Series(["a", "b", "c"]) + s.loc[0] = None + s.loc[1] = np.nan + s + Calculations with missing data ------------------------------ @@ -394,7 +422,7 @@ at the new values. ser = Series(np.sort(np.random.uniform(size=100))) # interpolate at new_index - new_index = ser.index + Index([49.25, 49.5, 49.75, 50.25, 50.5, 50.75]) + new_index = ser.index | Index([49.25, 49.5, 49.75, 50.25, 50.5, 50.75]) interp_s = ser.reindex(new_index).interpolate(method='pchip') interp_s[49:51] diff --git a/doc/source/options.rst b/doc/source/options.rst index 1e8517014bfc5..5edd28e559bc1 100644 --- a/doc/source/options.rst +++ b/doc/source/options.rst @@ -86,6 +86,8 @@ pandas namespace. 
To change an option, call ``set_option('option regex', new_va pd.set_option('mode.sim_interactive', True) pd.get_option('mode.sim_interactive') +**Note:** that the option 'mode.sim_interactive' is mostly used for debugging purposes. + All options also have a default value, and you can use ``reset_option`` to do just that: .. ipython:: python @@ -122,6 +124,25 @@ are restored automatically when you exit the `with` block: print(pd.get_option("display.max_columns")) +Setting Startup Options in python/ipython Environment +----------------------------------------------------- + +Using startup scripts for the python/ipython environment to import pandas and set options makes working with pandas more efficient. To do this, create a .py or .ipy script in the startup directory of the desired profile. An example where the startup folder is in a default ipython profile can be found at: + +.. code-block:: python + + $IPYTHONDIR/profile_default/startup + +More information can be found in the `ipython documentation +`__. An example startup script for pandas is displayed below: + +.. code-block:: python + + import pandas as pd + pd.set_option('display.max_rows', 999) + pd.set_option('precision', 5) + + Frequently Used Options ----------------------- The following is a walkthrough of the more frequently used display options. @@ -327,6 +348,9 @@ display.max_seq_items 100 when pretty-printing a long sequence, of "..." to the resulting string. If set to None, the number of items to be printed is unlimited. +display.memory_usage True This specifies if the memory usage of + a DataFrame should be displayed when the + df.info() method is invoked. display.mpl_style None Setting this to 'default' will modify the rcParams used by matplotlib to give plots a more pleasing visual diff --git a/doc/source/release.rst b/doc/source/release.rst index 9dc96219f42d9..1bfdee80faaa2 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -45,22 +45,130 @@ analysis / manipulation tool available in any language. * Binary installers on PyPI: http://pypi.python.org/pypi/pandas * Documentation: http://pandas.pydata.org +pandas 0.15.1 +------------- + +**Release date:** (November ??, 2014) + +This is a minor release from 0.15.0 and includes a small number of API changes, several new features, enhancements, and +performance improvements along with a large number of bug fixes. + +Highlights include: + +See the :ref:`v0.15.1 Whatsnew ` overview or the issue tracker on GitHub for an extensive list +of all API changes, enhancements and bugs that have been fixed in 0.15.1. + +Thanks +~~~~~~ + pandas 0.15.0 ------------- -**Release date:** (???) +**Release date:** (October 18, 2014) This is a major release from 0.14.1 and includes a number of API changes, several new features, enhancements, and performance improvements along with a large number of bug fixes. 
Highlights include: -See the :ref:`v0.15.0 Whatsnew ` overview or the issue tracker on GitHub for an extensive list +- Drop support for numpy < 1.7.0 (:issue:`7711`) +- The ``Categorical`` type was integrated as a first-class pandas type, see :ref:`here ` +- New scalar type ``Timedelta``, and a new index type ``TimedeltaIndex``, see :ref:`here ` +- New DataFrame default display for ``df.info()`` to include memory usage, see :ref:`Memory Usage ` +- New datetimelike properties accessor ``.dt`` for Series, see :ref:`Datetimelike Properties ` +- Split indexing documentation into :ref:`Indexing and Selecting Data ` and :ref:`MultiIndex / Advanced Indexing ` +- Split out string methods documentation into :ref:`Working with Text Data ` +- ``read_csv`` will now by default ignore blank lines when parsing, see :ref:`here ` +- API change in using Indexes in set operations, see :ref:`here ` +- Internal refactoring of the ``Index`` class to no longer sub-class ``ndarray``, see :ref:`Internal Refactoring ` +- dropping support for ``PyTables`` less than version 3.0.0, and ``numexpr`` less than version 2.1 (:issue:`7990`) + +See the :ref:`v0.15.0 Whatsnew ` overview or the issue tracker on GitHub for an extensive list of all API changes, enhancements and bugs that have been fixed in 0.15.0. Thanks ~~~~~~ +- Aaron Schumacher +- Adam Greenhall +- Andy Hayden +- Anthony O'Brien +- Artemy Kolchinsky +- behzad nouri +- Benedikt Sauer +- benjamin +- Benjamin Thyreau +- Ben Schiller +- bjonen +- BorisVerk +- Chris Reynolds +- Chris Stoafer +- Dav Clark +- dlovell +- DSM +- dsm054 +- FragLegs +- German Gomez-Herrero +- Hsiaoming Yang +- Huan Li +- hunterowens +- Hyungtae Kim +- immerrr +- Isaac Slavitt +- ischwabacher +- Jacob Schaer +- Jacob Wasserman +- Jan Schulz +- Jeff Tratner +- Jesse Farnham +- jmorris0x0 +- jnmclarty +- Joe Bradish +- Joerg Rittinger +- John W. O'Brien +- Joris Van den Bossche +- jreback +- Kevin Sheppard +- klonuo +- Kyle Meyer +- lexual +- Max Chang +- mcjcode +- Michael Mueller +- Michael W Schatzow +- Mike Kelly +- Mortada Mehyar +- mtrbean +- Nathan Sanders +- Nathan Typanski +- onesandzeroes +- Paul Masurel +- Phillip Cloud +- Pietro Battiston +- RenzoBertocchi +- rockg +- Ross Petchler +- seth-p +- Shahul Hameed +- Shashank Agarwal +- sinhrks +- someben +- stahlous +- stas-sl +- Stephan Hoyer +- thatneat +- tom-alcorn +- TomAugspurger +- Tom Augspurger +- Tony Lorenzo +- unknown +- unutbu +- Wes Turner +- Wilfred Hughes +- Yevgeniy Grechka +- Yoshiki Vázquez Baeza +- zachcp + pandas 0.14.1 ------------- diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index 8d718bacd262b..ddbfc60a5dfe7 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -77,7 +77,7 @@ this form, use the ``pivot`` function: If the ``values`` argument is omitted, and the input DataFrame has more than one column of values which are not used as column or index inputs to ``pivot``, then the resulting "pivoted" DataFrame will have :ref:`hierarchical columns -` whose topmost level indicates the respective value +` whose topmost level indicates the respective value column: .. ipython:: python @@ -103,7 +103,7 @@ Reshaping by stacking and unstacking Closely related to the ``pivot`` function are the related ``stack`` and ``unstack`` functions currently available on Series and DataFrame. These functions are designed to work together with ``MultiIndex`` objects (see the -section on :ref:`hierarchical indexing `). Here are +section on :ref:`hierarchical indexing `). 
Here are essentially what these functions do: - ``stack``: "pivot" a level of the (possibly hierarchical) column labels, @@ -480,6 +480,49 @@ This function is often used along with discretization functions like ``cut``: See also :func:`Series.str.get_dummies `. +.. versionadded:: 0.15.0 + +:func:`get_dummies` also accepts a DataFrame. By default all categorical +variables (categorical in the statistical sense, +those with `object` or `categorical` dtype) are encoded as dummy variables. + + +.. ipython:: python + + df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['c', 'c', 'b'], + 'C': [1, 2, 3]}) + pd.get_dummies(df) + +All non-object columns are included untouched in the output. + +You can control the columns that are encoded with the ``columns`` keyword. + +.. ipython:: python + + pd.get_dummies(df, columns=['A']) + +Notice that the ``B`` column is still included in the output, it just hasn't +been encoded. You can drop ``B`` before calling ``get_dummies`` if you don't +want to include it in the output. + +As with the Series version, you can pass values for the ``prefix`` and +``prefix_sep``. By default the column name is used as the prefix, and '_' as +the prefix separator. You can specify ``prefix`` and ``prefix_sep`` in 3 ways + +- string: Use the same value for ``prefix`` or ``prefix_sep`` for each column + to be encoded +- list: Must be the same length as the number of columns being encoded. +- dict: Mapping column name to prefix + +.. ipython:: python + + simple = pd.get_dummies(df, prefix='new_prefix') + simple + from_list = pd.get_dummies(df, prefix=['from_A', 'from_B']) + from_list + from_dict = pd.get_dummies(df, prefix={'B': 'from_B', 'A': 'from_A'}) + from_dict + Factorizing values ------------------ @@ -505,3 +548,10 @@ handling of NaN: pd.factorize(x, sort=True) np.unique(x, return_inverse=True)[::-1] + +.. note:: + If you just want to handle one column as a categorical variable (like R's factor), + you can use ``df["cat_col"] = pd.Categorical(df["col"])`` or + ``df["cat_col"] = df["col"].astype("category")``. For full docs on :class:`~pandas.Categorical`, + see the :ref:`Categorical introduction ` and the + :ref:`API documentation `. This feature was introduced in version 0.15. diff --git a/doc/source/rplot.rst b/doc/source/rplot.rst deleted file mode 100644 index 46b57cea2d9ed..0000000000000 --- a/doc/source/rplot.rst +++ /dev/null @@ -1,179 +0,0 @@ -.. currentmodule:: pandas -.. _rplot: - -.. ipython:: python - :suppress: - - import numpy as np - np.random.seed(123456) - from pandas import * - options.display.max_rows=15 - import pandas.util.testing as tm - randn = np.random.randn - np.set_printoptions(precision=4, suppress=True) - import matplotlib.pyplot as plt - tips_data = read_csv('data/tips.csv') - iris_data = read_csv('data/iris.data') - from pandas import read_csv - from pandas.tools.plotting import radviz - import pandas.tools.rplot as rplot - plt.close('all') - -************************** -Trellis plotting interface -************************** - -.. note:: - - The tips data set can be downloaded `here - `__. Once you download it execute - - .. code-block:: python - - from pandas import read_csv - tips_data = read_csv('tips.csv') - - from the directory where you downloaded the file. - -We import the rplot API: - -.. ipython:: python - - import pandas.tools.rplot as rplot - --------- -Examples --------- - -RPlot is a flexible API for producing Trellis plots. These plots allow you to arrange data in a rectangular grid by values of certain attributes. - -.. 
ipython:: python - - plt.figure() - - plot = rplot.RPlot(tips_data, x='total_bill', y='tip') - plot.add(rplot.TrellisGrid(['sex', 'smoker'])) - plot.add(rplot.GeomHistogram()) - - @savefig rplot1_tips.png - plot.render(plt.gcf()) - -In the example above, data from the tips data set is arranged by the attributes 'sex' and 'smoker'. Since both of those attributes can take on one of two values, the resulting grid has two columns and two rows. A histogram is displayed for each cell of the grid. - -.. ipython:: python - - plt.figure() - - plot = rplot.RPlot(tips_data, x='total_bill', y='tip') - plot.add(rplot.TrellisGrid(['sex', 'smoker'])) - plot.add(rplot.GeomDensity()) - - @savefig rplot2_tips.png - plot.render(plt.gcf()) - -Example above is the same as previous except the plot is set to kernel density estimation. This shows how easy it is to have different plots for the same Trellis structure. - -.. ipython:: python - - plt.figure() - - plot = rplot.RPlot(tips_data, x='total_bill', y='tip') - plot.add(rplot.TrellisGrid(['sex', 'smoker'])) - plot.add(rplot.GeomScatter()) - plot.add(rplot.GeomPolyFit(degree=2)) - - @savefig rplot3_tips.png - plot.render(plt.gcf()) - -The plot above shows that it is possible to have two or more plots for the same data displayed on the same Trellis grid cell. - -.. ipython:: python - - plt.figure() - - plot = rplot.RPlot(tips_data, x='total_bill', y='tip') - plot.add(rplot.TrellisGrid(['sex', 'smoker'])) - plot.add(rplot.GeomScatter()) - plot.add(rplot.GeomDensity2D()) - - @savefig rplot4_tips.png - plot.render(plt.gcf()) - -Above is a similar plot but with 2D kernel density estimation plot superimposed. - -.. ipython:: python - - plt.figure() - - plot = rplot.RPlot(tips_data, x='total_bill', y='tip') - plot.add(rplot.TrellisGrid(['sex', '.'])) - plot.add(rplot.GeomHistogram()) - - @savefig rplot5_tips.png - plot.render(plt.gcf()) - -It is possible to only use one attribute for grouping data. The example above only uses 'sex' attribute. If the second grouping attribute is not specified, the plots will be arranged in a column. - -.. ipython:: python - - plt.figure() - - plot = rplot.RPlot(tips_data, x='total_bill', y='tip') - plot.add(rplot.TrellisGrid(['.', 'smoker'])) - plot.add(rplot.GeomHistogram()) - - @savefig rplot6_tips.png - plot.render(plt.gcf()) - -If the first grouping attribute is not specified the plots will be arranged in a row. - -.. ipython:: python - - plt.figure() - - plot = rplot.RPlot(tips_data, x='total_bill', y='tip') - plot.add(rplot.TrellisGrid(['.', 'smoker'])) - plot.add(rplot.GeomHistogram()) - - plot = rplot.RPlot(tips_data, x='tip', y='total_bill') - plot.add(rplot.TrellisGrid(['sex', 'smoker'])) - plot.add(rplot.GeomPoint(size=80.0, colour=rplot.ScaleRandomColour('day'), shape=rplot.ScaleShape('size'), alpha=1.0)) - - @savefig rplot7_tips.png - plot.render(plt.gcf()) - -As shown above, scatter plots are also possible. Scatter plots allow you to map various data attributes to graphical properties of the plot. In the example above the colour and shape of the scatter plot graphical objects is mapped to 'day' and 'size' attributes respectively. You use scale objects to specify these mappings. The list of scale classes is given below with initialization arguments for quick reference. - ------- -Scales ------- - -:: - - ScaleGradient(column, colour1, colour2) - -This one allows you to map an attribute (specified by parameter column) value to the colour of a graphical object. 
The larger the value of the attribute the closer the colour will be to colour2, the smaller the value, the closer it will be to colour1. - -:: - - ScaleGradient2(column, colour1, colour2, colour3) - -The same as ScaleGradient but interpolates linearly between three colours instead of two. - -:: - - ScaleSize(column, min_size, max_size, transform) - -Map attribute value to size of the graphical object. Parameter min_size (default 5.0) is the minimum size of the graphical object, max_size (default 100.0) is the maximum size and transform is a one argument function that will be used to transform the attribute value (defaults to lambda x: x). - -:: - - ScaleShape(column) - -Map the shape of the object to attribute value. The attribute has to be categorical. - -:: - - ScaleRandomColour(column) - -Assign a random colour to a value of categorical attribute specified by column. diff --git a/doc/source/text.rst b/doc/source/text.rst new file mode 100644 index 0000000000000..7032f5ff648a7 --- /dev/null +++ b/doc/source/text.rst @@ -0,0 +1,230 @@ +.. currentmodule:: pandas +.. _text: + +.. ipython:: python + :suppress: + + import numpy as np + from pandas import * + randn = np.random.randn + np.set_printoptions(precision=4, suppress=True) + from pandas.compat import lrange + options.display.max_rows=15 + +====================== +Working with Text Data +====================== + +.. _text.string_methods: + +Series is equipped with a set of string processing methods +that make it easy to operate on each element of the array. Perhaps most +importantly, these methods exclude missing/NA values automatically. These are +accessed via the Series's ``str`` attribute and generally have names matching +the equivalent (scalar) built-in string methods: + +.. ipython:: python + + s = Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']) + s.str.lower() + s.str.upper() + s.str.len() + +Splitting and Replacing Strings +------------------------------- + +.. _text.split: + +Methods like ``split`` return a Series of lists: + +.. ipython:: python + + s2 = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h']) + s2.str.split('_') + +Easy to expand this to return a DataFrame + +.. ipython:: python + + s2.str.split('_').apply(Series) + +Elements in the split lists can be accessed using ``get`` or ``[]`` notation: + +.. ipython:: python + + s2.str.split('_').str.get(1) + s2.str.split('_').str[1] + +Methods like ``replace`` and ``findall`` take `regular expressions +`__, too: + +.. ipython:: python + + s3 = Series(['A', 'B', 'C', 'Aaba', 'Baca', + '', np.nan, 'CABA', 'dog', 'cat']) + s3 + s3.str.replace('^.a|dog', 'XX-XX ', case=False) + +Some caution must be taken to keep regular expressions in mind! For example, the +following code will cause trouble because of the regular expression meaning of +`$`: + +.. ipython:: python + + # Consider the following badly formatted financial data + dollars = Series(['12', '-$10', '$10,000']) + + # This does what you'd naively expect: + dollars.str.replace('$', '') + + # But this doesn't: + dollars.str.replace('-$', '-') + + # We need to escape the special character (for >1 len patterns) + dollars.str.replace(r'-\$', '-') + +Indexing with ``.str`` +---------------------- + +.. _text.indexing: + +You can use ``[]`` notation to directly index by position locations. If you index past the end +of the string, the result will be a ``NaN``. + + +.. 
ipython:: python + + s = Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, + 'CABA', 'dog', 'cat']) + + s.str[0] + s.str[1] + +Extracting Substrings +--------------------- + +.. _text.extract: + +The method ``extract`` (introduced in version 0.13) accepts `regular expressions +`__ with match groups. Extracting a +regular expression with one group returns a Series of strings. + +.. ipython:: python + + Series(['a1', 'b2', 'c3']).str.extract('[ab](\d)') + +Elements that do not match return ``NaN``. Extracting a regular expression +with more than one group returns a DataFrame with one column per group. + +.. ipython:: python + + Series(['a1', 'b2', 'c3']).str.extract('([ab])(\d)') + +Elements that do not match return a row filled with ``NaN``. +Thus, a Series of messy strings can be "converted" into a +like-indexed Series or DataFrame of cleaned-up or more useful strings, +without necessitating ``get()`` to access tuples or ``re.match`` objects. + +The results dtype always is object, even if no match is found and the result +only contains ``NaN``. + +Named groups like + +.. ipython:: python + + Series(['a1', 'b2', 'c3']).str.extract('(?P[ab])(?P\d)') + +and optional groups like + +.. ipython:: python + + Series(['a1', 'b2', '3']).str.extract('(?P[ab])?(?P\d)') + +can also be used. + +Testing for Strings that Match or Contain a Pattern +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can check whether elements contain a pattern: + +.. ipython:: python + + pattern = r'[a-z][0-9]' + Series(['1', '2', '3a', '3b', '03c']).str.contains(pattern) + +or match a pattern: + + +.. ipython:: python + + Series(['1', '2', '3a', '3b', '03c']).str.match(pattern, as_indexer=True) + +The distinction between ``match`` and ``contains`` is strictness: ``match`` +relies on strict ``re.match``, while ``contains`` relies on ``re.search``. + +.. warning:: + + In previous versions, ``match`` was for *extracting* groups, + returning a not-so-convenient Series of tuples. The new method ``extract`` + (described in the previous section) is now preferred. + + This old, deprecated behavior of ``match`` is still the default. As + demonstrated above, use the new behavior by setting ``as_indexer=True``. + In this mode, ``match`` is analogous to ``contains``, returning a boolean + Series. The new behavior will become the default behavior in a future + release. + +Methods like ``match``, ``contains``, ``startswith``, and ``endswith`` take + an extra ``na`` argument so missing values can be considered True or False: + +.. ipython:: python + + s4 = Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']) + s4.str.contains('A', na=False) + +Creating Indicator Variables +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can extract dummy variables from string columns. +For example if they are separated by a ``'|'``: + + .. ipython:: python + + s = Series(['a', 'a|b', np.nan, 'a|c']) + s.str.get_dummies(sep='|') + +See also :func:`~pandas.get_dummies`. + +Method Summary +-------------- + +.. _text.summary: + +.. 
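Before the summary table below, a short, hedged sketch showing a few of the listed methods in action; the sample Series is invented for illustration and output formatting may vary.

.. code-block:: python

    from pandas import Series

    s = Series(['cat', 'dog', 'bird'])

    # pad adds whitespace (on the left by default); center pads both sides
    s.str.pad(8)
    s.str.center(8)

    # repeat duplicates each string, e.g. 'cat' -> 'catcat'
    s.str.repeat(2)

    # cat concatenates all elements into a single string
    s.str.cat(sep=', ')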
csv-table:: + :header: "Method", "Description" + :widths: 20, 80 + + :meth:`~core.strings.StringMethods.cat`,Concatenate strings + :meth:`~core.strings.StringMethods.split`,Split strings on delimiter + :meth:`~core.strings.StringMethods.get`,Index into each element (retrieve i-th element) + :meth:`~core.strings.StringMethods.join`,Join strings in each element of the Series with passed separator + :meth:`~core.strings.StringMethods.contains`,Return boolean array if each string contains pattern/regex + :meth:`~core.strings.StringMethods.replace`,Replace occurrences of pattern/regex with some other string + :meth:`~core.strings.StringMethods.repeat`,Duplicate values (``s.str.repeat(3)`` equivalent to ``x * 3``) + :meth:`~core.strings.StringMethods.pad`,"Add whitespace to left, right, or both sides of strings" + :meth:`~core.strings.StringMethods.center`,Equivalent to ``pad(side='both')`` + :meth:`~core.strings.StringMethods.wrap`,Split long strings into lines with length less than a given width + :meth:`~core.strings.StringMethods.slice`,Slice each string in the Series + :meth:`~core.strings.StringMethods.slice_replace`,Replace slice in each string with passed value + :meth:`~core.strings.StringMethods.count`,Count occurrences of pattern + :meth:`~core.strings.StringMethods.startswith`,Equivalent to ``str.startswith(pat)`` for each element + :meth:`~core.strings.StringMethods.endswith`,Equivalent to ``str.endswith(pat)`` for each element + :meth:`~core.strings.StringMethods.findall`,Compute list of all occurrences of pattern/regex for each string + :meth:`~core.strings.StringMethods.match`,"Call ``re.match`` on each element, returning matched groups as list" + :meth:`~core.strings.StringMethods.extract`,"Call ``re.match`` on each element, as ``match`` does, but return matched groups as strings for convenience." + :meth:`~core.strings.StringMethods.len`,Compute string lengths + :meth:`~core.strings.StringMethods.strip`,Equivalent to ``str.strip`` + :meth:`~core.strings.StringMethods.rstrip`,Equivalent to ``str.rstrip`` + :meth:`~core.strings.StringMethods.lstrip`,Equivalent to ``str.lstrip`` + :meth:`~core.strings.StringMethods.lower`,Equivalent to ``str.lower`` + :meth:`~core.strings.StringMethods.upper`,Equivalent to ``str.upper`` diff --git a/doc/source/timedeltas.rst b/doc/source/timedeltas.rst new file mode 100644 index 0000000000000..1ad5492efe61a --- /dev/null +++ b/doc/source/timedeltas.rst @@ -0,0 +1,402 @@ +.. currentmodule:: pandas +.. _timedeltas: + +.. ipython:: python + :suppress: + + from datetime import datetime, timedelta + import numpy as np + np.random.seed(123456) + from pandas import * + randn = np.random.randn + randint = np.random.randint + np.set_printoptions(precision=4, suppress=True) + options.display.max_rows=15 + import dateutil + import pytz + from dateutil.relativedelta import relativedelta + from pandas.tseries.api import * + from pandas.tseries.offsets import * + +.. _timedeltas.timedeltas: + +*********** +Time Deltas +*********** + +.. note:: + + Starting in v0.15.0, we introduce a new scalar type ``Timedelta``, which is a subclass of ``datetime.timedelta``, and behaves in a similar manner, + but allows compatibility with ``np.timedelta64`` types as well as a host of custom representation, parsing, and attributes. + +Timedeltas are differences in times, expressed in difference units, e.g. days,hours,minutes,seconds. +They can be both positive and negative. + +Parsing +------- + +You can construct a ``Timedelta`` scalar thru various arguments: + +.. 
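The note above describes ``Timedelta`` as a subclass of ``datetime.timedelta`` that also works with ``np.timedelta64``. Before the construction examples that follow, a small sketch of that interoperability; the values in the comments are indicative only.

.. code-block:: python

    from datetime import timedelta
    from pandas import Timedelta

    td = Timedelta('1 days 2 hours')

    # Timedelta is a subclass of the standard library type ...
    isinstance(td, timedelta)     # True

    # ... and converts to and from the stdlib / numpy representations
    td.to_pytimedelta()           # a datetime.timedelta of 1 day, 2 hours
    td.to_timedelta64()           # a numpy.timedelta64 in nanoseconds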
ipython:: python + + # strings + Timedelta('1 days') + Timedelta('1 days 00:00:00') + Timedelta('1 days 2 hours') + Timedelta('-1 days 2 min 3us') + + # like datetime.timedelta + # note: these MUST be specified as keyword argments + Timedelta(days=1,seconds=1) + + # integers with a unit + Timedelta(1,unit='d') + + # from a timedelta/np.timedelta64 + Timedelta(timedelta(days=1,seconds=1)) + Timedelta(np.timedelta64(1,'ms')) + + # negative Timedeltas have this string repr + # to be more consistent with datetime.timedelta conventions + Timedelta('-1us') + + # a NaT + Timedelta('nan') + Timedelta('nat') + +:ref:`DateOffsets` (``Day, Hour, Minute, Second, Milli, Micro, Nano``) can also be used in construction. + +.. ipython:: python + + Timedelta(Second(2)) + +Further, operations among the scalars yield another scalar ``Timedelta`` + +.. ipython:: python + + Timedelta(Day(2)) + Timedelta(Second(2)) + Timedelta('00:00:00.000123') + +to_timedelta +~~~~~~~~~~~~ + +.. warning:: + + Prior to 0.15.0 ``pd.to_timedelta`` would return a ``Series`` for list-like/Series input, and a ``np.timedelta64`` for scalar input. + It will now return a ``TimedeltaIndex`` for list-like input, ``Series`` for Series input, and ``Timedelta`` for scalar input. + + The arguments to ``pd.to_timedelta`` are now ``(arg,unit='ns',box=True)``, previously were ``(arg,box=True,unit='ns')`` as these are more logical. + +Using the top-level ``pd.to_timedelta``, you can convert a scalar, array, list, or Series from a recognized timedelta format / value into a ``Timedelta`` type. +It will construct Series if the input is a Series, a scalar if the input is scalar-like, otherwise will output a ``TimedeltaIndex`` + +.. ipython:: python + + to_timedelta('1 days 06:05:01.00003') + to_timedelta('15.5us') + to_timedelta(['1 days 06:05:01.00003','15.5us','nan']) + to_timedelta(np.arange(5),unit='s') + to_timedelta(np.arange(5),unit='d') + +Operations +---------- + +You can operate on Series/DataFrames and construct ``timedelta64[ns]`` Series thru +subtraction operations on ``datetime64[ns]`` Series, or ``Timestamps``. + +.. ipython:: python + + s = Series(date_range('2012-1-1', periods=3, freq='D')) + td = Series([ Timedelta(days=i) for i in range(3) ]) + df = DataFrame(dict(A = s, B = td)) + df + df['C'] = df['A'] + df['B'] + df + df.dtypes + + s - s.max() + s - datetime(2011,1,1,3,5) + s + timedelta(minutes=5) + s + Minute(5) + s + Minute(5) + Milli(5) + +Operations with scalars from a ``timedelta64[ns]`` series + +.. ipython:: python + + y = s - s[0] + y + +Series of timedeltas with ``NaT`` values are supported + +.. ipython:: python + + y = s - s.shift() + y + +Elements can be set to ``NaT`` using ``np.nan`` analogously to datetimes + +.. ipython:: python + + y[1] = np.nan + y + +Operands can also appear in a reversed order (a singular object operated with a Series) + +.. ipython:: python + + s.max() - s + datetime(2011,1,1,3,5) - s + timedelta(minutes=5) + s + +``min, max`` and the corresponding ``idxmin, idxmax`` operations are supported on frames + +.. ipython:: python + + A = s - Timestamp('20120101') - Timedelta('00:05:05') + B = s - Series(date_range('2012-1-2', periods=3, freq='D')) + + df = DataFrame(dict(A=A, B=B)) + df + + df.min() + df.min(axis=1) + + df.idxmin() + df.idxmax() + +``min, max, idxmin, idxmax`` operations are supported on Series as well. A scalar result will be a ``Timedelta``. + +.. 
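In the same vein, a self-contained sketch of the reduction behaviour just described: reducing a ``timedelta64[ns]`` Series hands back a ``Timedelta`` scalar. The sample values are arbitrary.

.. code-block:: python

    from pandas import Series, Timedelta, to_timedelta

    tds = Series(to_timedelta(['1 days', '2 days 06:00:00', '12 hours']))

    # reductions on a timedelta64[ns] Series return a Timedelta scalar
    shortest = tds.min()
    isinstance(shortest, Timedelta)   # True
    shortest                          # Timedelta('0 days 12:00:00')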
ipython:: python + + df.min().max() + df.min(axis=1).min() + + df.min().idxmax() + df.min(axis=1).idxmin() + +You can fillna on timedeltas. Integers will be interpreted as seconds. You can +pass a timedelta to get a particular value. + +.. ipython:: python + + y.fillna(0) + y.fillna(10) + y.fillna(Timedelta('-1 days, 00:00:05')) + +You can also negate, multiply and use ``abs`` on ``Timedeltas`` + +.. ipython:: python + + td1 = Timedelta('-1 days 2 hours 3 seconds') + td1 + -1 * td1 + - td1 + abs(td1) + +.. _timedeltas.timedeltas_reductions: + +Reductions +---------- + +Numeric reduction operation for ``timedelta64[ns]`` will return ``Timedelta`` objects. As usual +``NaT`` are skipped during evaluation. + +.. ipython:: python + + y2 = Series(to_timedelta(['-1 days +00:00:05','nat','-1 days +00:00:05','1 days'])) + y2 + y2.mean() + y2.median() + y2.quantile(.1) + y2.sum() + +.. _timedeltas.timedeltas_convert: + +Frequency Conversion +-------------------- + +.. versionadded:: 0.13 + +Timedelta Series, ``TimedeltaIndex``, and ``Timedelta`` scalars can be converted to other 'frequencies' by dividing by another timedelta, +or by astyping to a specific timedelta type. These operations yield Series and propogate ``NaT`` -> ``nan``. +Note that division by the numpy scalar is true division, while astyping is equivalent of floor division. + +.. ipython:: python + + td = Series(date_range('20130101',periods=4)) - \ + Series(date_range('20121201',periods=4)) + td[2] += timedelta(minutes=5,seconds=3) + td[3] = np.nan + td + + # to days + td / np.timedelta64(1,'D') + td.astype('timedelta64[D]') + + # to seconds + td / np.timedelta64(1,'s') + td.astype('timedelta64[s]') + + # to months (these are constant months) + td / np.timedelta64(1,'M') + +Dividing or multiplying a ``timedelta64[ns]`` Series by an integer or integer Series +yields another ``timedelta64[ns]`` dtypes Series. + +.. ipython:: python + + td * -1 + td * Series([1,2,3,4]) + +Attributes +---------- + +You can access various components of the ``Timedelta`` or ``TimedeltaIndex`` directly using the attributes ``days,hours,minutes,seconds,milliseconds,microseconds,nanoseconds``. +These operations can be directly accessed via the ``.dt`` property of the ``Series`` as well. These return an integer representing that interval (which is signed according to whether the ``Timedelta`` is signed). + +For a ``Series`` + +.. ipython:: python + + td.dt.days + td.dt.seconds + +You can access the component field for a scalar ``Timedelta`` directly. + +.. ipython:: python + + tds = Timedelta('31 days 5 min 3 sec') + tds.days + tds.seconds + (-tds).seconds + +You can use the ``.components`` property to access a reduced form of the timedelta. This returns a ``DataFrame`` indexed +similarly to the ``Series`` + +.. ipython:: python + + td.dt.components + +.. _timedeltas.attribues_warn: + +.. warning:: + + ``Timedelta`` scalars (and ``TimedeltaIndex``) component fields are *not the same* as the component fields on a ``datetime.timedelta`` object. For example, ``.seconds`` on a ``datetime.timedelta`` object returns the total number of seconds combined between ``hours``, ``minutes`` and ``seconds``. In contrast, the pandas ``Timedelta`` breaks out hours, minutes, microseconds and nanoseconds separately. + + .. ipython:: python + + # Timedelta accessor + tds = Timedelta('31 days 5 min 3 sec') + tds.minutes + tds.seconds + + # datetime.timedelta accessor + # this is 5 minutes * 60 + 3 seconds + tds.to_pytimedelta().seconds + + +.. 
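Tied to the warning above: when a single unambiguous number is wanted rather than the separate component fields, ``total_seconds`` (inherited from ``datetime.timedelta``) covers the whole duration. A small sketch, assuming that method is available in your version:

.. code-block:: python

    from pandas import Timedelta

    tds = Timedelta('31 days 5 min 3 sec')

    # one number for the full duration: 31 days plus 5 minutes 3 seconds
    tds.total_seconds()               # 2678703.0

    # the stdlib accessor folds hours/minutes/seconds together
    tds.to_pytimedelta().seconds      # 303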
_timedeltas.index: + +TimedeltaIndex +-------------- + +.. versionadded:: 0.15.0 + +To generate an index with time delta, you can use either the TimedeltaIndex or +the ``timedelta_range`` constructor. + +Using ``TimedeltaIndex`` you can pass string-like, ``Timedelta``, ``timedelta``, +or ``np.timedelta64`` objects. Passing ``np.nan/pd.NaT/nat`` will represent missing values. + +.. ipython:: python + + TimedeltaIndex(['1 days','1 days, 00:00:05', + np.timedelta64(2,'D'),timedelta(days=2,seconds=2)]) + +Similarly to ``date_range``, you can construct regular ranges of a ``TimedeltaIndex``: + +.. ipython:: python + + timedelta_range(start='1 days',periods=5,freq='D') + timedelta_range(start='1 days',end='2 days',freq='30T') + +Using the TimedeltaIndex +~~~~~~~~~~~~~~~~~~~~~~~~ + +Similarly to other of the datetime-like indices, ``DatetimeIndex`` and ``PeriodIndex``, you can use +``TimedeltaIndex`` as the index of pandas objects. + +.. ipython:: python + + s = Series(np.arange(100), + index=timedelta_range('1 days',periods=100,freq='h')) + s + +Selections work similary, with coercion on string-likes and slices: + +.. ipython:: python + + s['1 day':'2 day'] + s['1 day 01:00:00'] + s[Timedelta('1 day 1h')] + +Furthermore you can use partial string selection and the range will be inferred: + +.. ipython:: python + + s['1 day':'1 day 5 hours'] + +Operations +~~~~~~~~~~ + +Finally, the combination of ``TimedeltaIndex`` with ``DatetimeIndex`` allow certain combination operations that are NaT preserving: + +.. ipython:: python + + tdi = TimedeltaIndex(['1 days',pd.NaT,'2 days']) + tdi.tolist() + dti = date_range('20130101',periods=3) + dti.tolist() + (dti + tdi).tolist() + (dti - tdi).tolist() + +Conversions +~~~~~~~~~~~ + +Similarly to frequency conversion on a ``Series`` above, you can convert these indices to yield another Index. + +.. ipython:: python + + tdi / np.timedelta64(1,'s') + tdi.astype('timedelta64[s]') + +Scalars type ops work as well. These can potentially return a *different* type of index. + +.. ipython:: python + + # adding or timedelta and date -> datelike + tdi + Timestamp('20130101') + + # subtraction of a date and a timedelta -> datelike + # note that trying to subtract a date from a Timedelta will raise an exception + (Timestamp('20130101') - tdi).tolist() + + # timedelta + timedelta -> timedelta + tdi + Timedelta('10 days') + + # division can result in a Timedelta if the divisor is an integer + tdi / 2 + + # or a Float64Index if the divisor is a Timedelta + tdi / tdi[0] + +.. _timedeltas.resampling: + +Resampling +---------- + +Similar to :ref:`timeseries resampling `, we can resample with a ``TimedeltaIndex``. + +.. ipython:: python + + s.resample('D') diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index 8f96ec98df6f2..7b0b0fdf624e8 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -1357,6 +1357,9 @@ Pandas provides rich support for working with timestamps in different time zones ``dateutil`` support is new [in 0.14.1] and currently only supported for fixed offset and tzfile zones. The default library is ``pytz``. Support for ``dateutil`` is provided for compatibility with other applications e.g. if you use ``dateutil`` in other python packages. +Working with Time Zones +~~~~~~~~~~~~~~~~~~~~~~~ + By default, pandas objects are time zone unaware: .. 
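To make the unaware/aware distinction concrete, a brief, self-contained sketch of localizing and then converting an index; the dates and zones are chosen arbitrarily for illustration.

.. code-block:: python

    from pandas import date_range

    rng = date_range('2014-08-01 09:00', periods=3, freq='H')

    # freshly constructed indexes carry no time zone
    rng.tz is None                    # True

    # attach a zone with tz_localize, then move between zones with tz_convert
    rng_eastern = rng.tz_localize('US/Eastern')
    rng_eastern.tz_convert('UTC')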
ipython:: python @@ -1488,23 +1491,9 @@ TimeSeries, aligning the data on the UTC timestamps: result result.index -In some cases, localize cannot determine the DST and non-DST hours when there are -duplicates. This often happens when reading files that simply duplicate the hours. -The infer_dst argument in tz_localize will attempt -to determine the right offset. - -.. ipython:: python - :okexcept: - - rng_hourly = DatetimeIndex(['11/06/2011 00:00', '11/06/2011 01:00', - '11/06/2011 01:00', '11/06/2011 02:00', - '11/06/2011 03:00']) - rng_hourly.tz_localize('US/Eastern') - rng_hourly_eastern = rng_hourly.tz_localize('US/Eastern', infer_dst=True) - rng_hourly_eastern.values - - -To remove timezone from tz-aware ``DatetimeIndex``, use ``tz_localize(None)`` or ``tz_convert(None)``. ``tz_localize(None)`` will remove timezone holding local time representations. ``tz_convert(None)`` will remove timezone after converting to UTC time. +To remove timezone from tz-aware ``DatetimeIndex``, use ``tz_localize(None)`` or ``tz_convert(None)``. +``tz_localize(None)`` will remove timezone holding local time representations. +``tz_convert(None)`` will remove timezone after converting to UTC time. .. ipython:: python @@ -1516,172 +1505,48 @@ To remove timezone from tz-aware ``DatetimeIndex``, use ``tz_localize(None)`` or # tz_convert(None) is identical with tz_convert('UTC').tz_localize(None) didx.tz_convert('UCT').tz_localize(None) -.. _timeseries.timedeltas: - -Time Deltas ------------ - -Timedeltas are differences in times, expressed in difference units, e.g. days,hours,minutes,seconds. -They can be both positive and negative. :ref:`DateOffsets` that are absolute in nature -(``Day, Hour, Minute, Second, Milli, Micro, Nano``) can be used as ``timedeltas``. - -.. ipython:: python - - from datetime import datetime, timedelta - s = Series(date_range('2012-1-1', periods=3, freq='D')) - td = Series([ timedelta(days=i) for i in range(3) ]) - df = DataFrame(dict(A = s, B = td)) - df - df['C'] = df['A'] + df['B'] - df - df.dtypes - - s - s.max() - s - datetime(2011,1,1,3,5) - s + timedelta(minutes=5) - s + Minute(5) - s + Minute(5) + Milli(5) - -Getting scalar results from a ``timedelta64[ns]`` series - -.. ipython:: python - - y = s - s[0] - y - -Series of timedeltas with ``NaT`` values are supported - -.. ipython:: python - - y = s - s.shift() - y - -Elements can be set to ``NaT`` using ``np.nan`` analogously to datetimes - -.. ipython:: python - - y[1] = np.nan - y - -Operands can also appear in a reversed order (a singular object operated with a Series) - -.. ipython:: python - - s.max() - s - datetime(2011,1,1,3,5) - s - timedelta(minutes=5) + s - -Some timedelta numeric like operations are supported. - -.. ipython:: python - - td - timedelta(minutes=5, seconds=5, microseconds=5) - -``min, max`` and the corresponding ``idxmin, idxmax`` operations are supported on frames - -.. ipython:: python - - A = s - Timestamp('20120101') - timedelta(minutes=5, seconds=5) - B = s - Series(date_range('2012-1-2', periods=3, freq='D')) - - df = DataFrame(dict(A=A, B=B)) - df +.. _timeseries.timezone_ambiguous: - df.min() - df.min(axis=1) +Ambiguous Times when Localizing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - df.idxmin() - df.idxmax() - -``min, max`` operations are supported on series; these return a single element -``timedelta64[ns]`` Series (this avoids having to deal with numpy timedelta64 -issues). ``idxmin, idxmax`` are supported as well. - -.. 
ipython:: python - - df.min().max() - df.min(axis=1).min() - - df.min().idxmax() - df.min(axis=1).idxmin() - -You can fillna on timedeltas. Integers will be interpreted as seconds. You can -pass a timedelta to get a particular value. - -.. ipython:: python - - y.fillna(0) - y.fillna(10) - y.fillna(timedelta(days=-1,seconds=5)) - -.. _timeseries.timedeltas_reductions: - -Time Deltas & Reductions ------------------------- - -.. warning:: - - A numeric reduction operation for ``timedelta64[ns]`` can return a single-element ``Series`` of - dtype ``timedelta64[ns]``. - -You can do numeric reduction operations on timedeltas. - -.. ipython:: python - - y2 = y.fillna(timedelta(days=-1,seconds=5)) - y2 - y2.mean() - y2.quantile(.1) - -.. _timeseries.timedeltas_convert: - -Time Deltas & Conversions -------------------------- - -.. versionadded:: 0.13 - -**string/integer conversion** - -Using the top-level ``to_timedelta``, you can convert a scalar or array from the standard -timedelta format (produced by ``to_csv``) into a timedelta type (``np.timedelta64`` in ``nanoseconds``). -It can also construct Series. - -.. warning:: - - This requires ``numpy >= 1.7`` +In some cases, localize cannot determine the DST and non-DST hours when there are +duplicates. This often happens when reading files or database records that simply +duplicate the hours. Passing ``ambiguous='infer'`` (``infer_dst`` argument in prior +releases) into ``tz_localize`` will attempt to determine the right offset. Below +the top example will fail as it contains ambiguous times and the bottom will +infer the right offset. .. ipython:: python + :okexcept: - to_timedelta('1 days 06:05:01.00003') - to_timedelta('15.5us') - to_timedelta(['1 days 06:05:01.00003','15.5us','nan']) - to_timedelta(np.arange(5),unit='s') - to_timedelta(np.arange(5),unit='d') + rng_hourly = DatetimeIndex(['11/06/2011 00:00', '11/06/2011 01:00', + '11/06/2011 01:00', '11/06/2011 02:00', + '11/06/2011 03:00']) -**frequency conversion** + # This will fail as there are ambiguous times + rng_hourly.tz_localize('US/Eastern') + rng_hourly_eastern = rng_hourly.tz_localize('US/Eastern', ambiguous='infer') + rng_hourly_eastern.tolist() -Timedeltas can be converted to other 'frequencies' by dividing by another timedelta, -or by astyping to a specific timedelta type. These operations yield ``float64`` dtyped Series. +In addition to 'infer', there are several other arguments supported. Passing +an array-like of bools or 0s/1s where True represents a DST hour and False a +non-DST hour, allows for distinguishing more than one DST +transition (e.g., if you have multiple records in a database each with their +own DST transition). Or passing 'NaT' will fill in transition times +with not-a-time values. These methods are available in the ``DatetimeIndex`` +constructor as well as ``tz_localize``. .. ipython:: python - td = Series(date_range('20130101',periods=4))-Series(date_range('20121201',periods=4)) - td[2] += np.timedelta64(timedelta(minutes=5,seconds=3)) - td[3] = np.nan - td + rng_hourly_dst = np.array([1, 1, 0, 0, 0]) + rng_hourly.tz_localize('US/Eastern', ambiguous=rng_hourly_dst).tolist() + rng_hourly.tz_localize('US/Eastern', ambiguous='NaT').tolist() - # to days - td / np.timedelta64(1,'D') - td.astype('timedelta64[D]') - - # to seconds - td / np.timedelta64(1,'s') - td.astype('timedelta64[s]') - -Dividing or multiplying a ``timedelta64[ns]`` Series by an integer or integer Series -yields another ``timedelta64[ns]`` dtypes Series. - -.. 
ipython:: python + didx = DatetimeIndex(start='2014-08-01 09:00', freq='H', periods=10, tz='US/Eastern') + didx + didx.tz_localize(None) + didx.tz_convert(None) - td * -1 - td * Series([1,2,3,4]) + # tz_convert(None) is identical with tz_convert('UTC').tz_localize(None) + didx.tz_convert('UCT').tz_localize(None) diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt deleted file mode 100644 index a6be3a1ed3d0e..0000000000000 --- a/doc/source/v0.15.0.txt +++ /dev/null @@ -1,569 +0,0 @@ -.. _whatsnew_0150: - -v0.15.0 (???) -------------- - -This is a major release from 0.14.1 and includes a small number of API changes, several new features, -enhancements, and performance improvements along with a large number of bug fixes. We recommend that all -users upgrade to this version. - -.. warning:: - - pandas >= 0.15.0 will no longer support compatibility with NumPy versions < - 1.7.0. If you want to use the latest versions of pandas, please upgrade to - NumPy >= 1.7.0. - -- Highlights include: - - - The ``Categorical`` type was integrated as a first-class pandas type, see :ref:`here ` - - Internal refactoring of the ``Index`` class to no longer sub-class ``ndarray``, see :ref:`Internal Refactoring ` - - New datetimelike properties accessor ``.dt`` for Series, see :ref:`Datetimelike Properties ` - - dropping support for ``PyTables`` less than version 3.0.0, and ``numexpr`` less than version 2.1 (:issue:`7990`) - -- :ref:`Other Enhancements ` - -- :ref:`API Changes ` - -- :ref:`Performance Improvements ` - -- :ref:`Prior Deprecations ` - -- :ref:`Deprecations ` - -- :ref:`Known Issues ` - -- :ref:`Bug Fixes ` - -.. warning:: - - In 0.15.0 ``Index`` has internally been refactored to no longer sub-class ``ndarray`` - but instead subclass ``PandasObject``, similarly to the rest of the pandas objects. This change allows very easy sub-classing and creation of new index types. This should be - a transparent change with only very limited API implications (See the :ref:`Internal Refactoring `) - -.. _whatsnew_0150.api: - -API changes -~~~~~~~~~~~ - -- Passing multiple levels to `DataFrame.stack()` will now work when multiple level - numbers are passed (:issue:`7660`), and will raise a ``ValueError`` when the - levels aren't all level names or all level numbers. See - :ref:`Reshaping by stacking and unstacking `. - -- :func:`set_names`, :func:`set_labels`, and :func:`set_levels` methods now take an optional ``level`` keyword argument to all modification of specific level(s) of a MultiIndex. Additionally :func:`set_names` now accepts a scalar string value when operating on an ``Index`` or on a specific level of a ``MultiIndex`` (:issue:`7792`) - - .. ipython:: python - - idx = pandas.MultiIndex.from_product([['a'], range(3), list("pqr")], names=['foo', 'bar', 'baz']) - idx.set_names('qux', level=0) - idx.set_names(['qux','baz'], level=[0,1]) - idx.set_levels(['a','b','c'], level='bar') - idx.set_levels([['a','b','c'],[1,2,3]], level=[1,2]) - -- Raise a ``ValueError`` in ``df.to_hdf`` with 'fixed' format, if ``df`` has non-unique columns as the resulting file will be broken (:issue:`7761`) - -- :func:`rolling_min`, :func:`rolling_max`, :func:`rolling_cov`, and :func:`rolling_corr` - now return objects with all ``NaN`` when ``len(arg) < min_periods <= window`` rather - than raising. (This makes all rolling functions consistent in this behavior), (:issue:`7766`) - - Prior to 0.15.0 - - .. ipython:: python - - s = Series([10, 11, 12, 13]) - - .. 
code-block:: python - - In [15]: rolling_min(s, window=10, min_periods=5) - ValueError: min_periods (5) must be <= window (4) - - New behavior - - .. ipython:: python - - rolling_min(s, window=10, min_periods=5) - -- :func:`ewma`, :func:`ewmstd`, :func:`ewmvol`, :func:`ewmvar`, :func:`ewmcorr`, and :func:`ewmcov` - now have an optional ``ignore_na`` argument. - When ``ignore_na=False`` (the default), missing values are taken into account in the weights calculation. - When ``ignore_na=True`` (which reproduces the pre-0.15.0 behavior), missing values are ignored in the weights calculation. - (:issue:`7543`) - - .. ipython:: python - - ewma(Series([None, 1., 100.]), com=2.5) - ewma(Series([1., None, 100.]), com=2.5, ignore_na=True) # pre-0.15.0 behavior - ewma(Series([1., None, 100.]), com=2.5, ignore_na=False) # default - -- :func:`ewma`, :func:`ewmstd`, :func:`ewmvol`, :func:`ewmvar`, :func:`ewmcorr`, and :func:`ewmcov` - now set to ``NaN`` the first ``min_periods-1`` entries of the result (for ``min_periods>1``). - Previously the first ``min_periods`` entries of the result were set to ``NaN``. - The new behavior accords with the existing documentation. (:issue:`7884`) - -- Bug in passing a ``DatetimeIndex`` with a timezone that was not being retained in DataFrame construction from a dict (:issue:`7822`) - - In prior versions this would drop the timezone. - - .. ipython:: python - - i = date_range('1/1/2011', periods=3, freq='10s', tz = 'US/Eastern') - i - df = DataFrame( {'a' : i } ) - df - df.dtypes - - This behavior is unchanged. - - .. ipython:: python - - df = DataFrame( ) - df['a'] = i - df - df.dtypes - -- ``SettingWithCopy`` raise/warnings (according to the option ``mode.chained_assignment``) will now be issued when setting a value on a sliced mixed-dtype DataFrame using chained-assignment. (:issue:`7845`, :issue:`7950`) - - .. code-block:: python - - In [1]: df = DataFrame(np.arange(0,9), columns=['count']) - - In [2]: df['group'] = 'b' - - In [3]: df.iloc[0:5]['group'] = 'a' - /usr/local/bin/ipython:1: SettingWithCopyWarning: - A value is trying to be set on a copy of a slice from a DataFrame. - Try using .loc[row_indexer,col_indexer] = value instead - - See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy - -- The ``infer_types`` argument to :func:`~pandas.io.html.read_html` now has no - effect (:issue:`7762`, :issue:`7032`). - -- ``DataFrame.to_stata`` and ``StataWriter`` check string length for - compatibility with limitations imposed in dta files where fixed-width - strings must contain 244 or fewer characters. Attempting to write Stata - dta files with strings longer than 244 characters raises a ``ValueError``. (:issue:`7858`) - -- ``Index.isin`` now supports a ``level`` argument to specify which index level - to use for membership tests (:issue:`7892`, :issue:`7890`) - - .. code-block:: python - - In [1]: idx = pd.MultiIndex.from_product([[0, 1], ['a', 'b', 'c']]) - - In [2]: idx.values - Out[2]: array([(0, 'a'), (0, 'b'), (0, 'c'), (1, 'a'), (1, 'b'), (1, 'c')], dtype=object) - - In [3]: idx.isin(['a', 'c', 'e'], level=1) - Out[3]: array([ True, False, True, True, False, True], dtype=bool) - -- ``tz_localize(None)`` for tz-aware ``Timestamp`` and ``DatetimeIndex`` now removes timezone holding local time, - previously results in ``Exception`` or ``TypeError`` (:issue:`7812`) - - .. 
ipython:: python - - ts = Timestamp('2014-08-01 09:00', tz='US/Eastern') - ts - ts.tz_localize(None) - - didx = DatetimeIndex(start='2014-08-01 09:00', freq='H', periods=10, tz='US/Eastern') - didx - didx.tz_localize(None) - -- ``DataFrame.tz_localize`` and ``DataFrame.tz_convert`` now accepts an optional ``level`` argument - for localizing a specific level of a MultiIndex (:issue:`7846`) - -- ``Timestamp.__repr__`` displays ``dateutil.tz.tzoffset`` info (:issue:`7907`) -- ``merge``, ``DataFrame.merge``, and ``ordered_merge`` now return the same type - as the ``left`` argument. (:issue:`7737`) - -- Histogram from ``DataFrame.plot`` with ``kind='hist'`` (:issue:`7809`), See :ref:`the docs`. -- Consistency when indexing with ``.loc`` and a list-like indexer when no values are found. - - .. ipython:: python - - df = DataFrame([['a'],['b']],index=[1,2]) - df - - In prior versions there was a difference in these two constructs: - - - ``df.loc[[3]]`` would (prior to 0.15.0) return a frame reindexed by 3 (with all ``np.nan`` values) - - ``df.loc[[3],:]`` would raise ``KeyError``. - - Both will now raise a ``KeyError``. The rule is that *at least 1* indexer must be found when using a list-like and ``.loc`` (:issue:`7999`) - - There was also a difference between ``df.loc[[1,3]]`` (returns a frame reindexed by ``[1, 3]``) and ``df.loc[[1, 3],:]`` (would raise ``KeyError`` prior to 0.15.0). Both will now return a reindexed frame. - - .. ipython:: python - - df.loc[[1,3]] - df.loc[[1,3],:] - - This can also be seen in multi-axis indexing with a ``Panel``. - - .. ipython:: python - - p = Panel(np.arange(2*3*4).reshape(2,3,4), - items=['ItemA','ItemB'],major_axis=[1,2,3],minor_axis=['A','B','C','D']) - p - - The following would raise ``KeyError`` prior to 0.15.0: - - .. ipython:: python - - p.loc[['ItemA','ItemD'],:,'D'] - - Furthermore, ``.loc`` will raise If no values are found in a multi-index with a list-like indexer: - - .. ipython:: python - :okexcept: - - s = Series(np.arange(3,dtype='int64'),index=MultiIndex.from_product([['A'],['foo','bar','baz']], - names=['one','two'])).sortlevel() - s - s.loc[['D']] - -- ``Index`` now supports ``duplicated`` and ``drop_duplicates``. (:issue:`4060`) - - .. ipython:: python - - idx = Index([1, 2, 3, 4, 1, 2]) - idx - idx.duplicated() - idx.drop_duplicates() - -.. _whatsnew_0150.dt: - -.dt accessor -~~~~~~~~~~~~ - -``Series`` has gained an accessor to succinctly return datetime like properties for the *values* of the Series, if its a datetime/period like Series. (:issue:`7207`) -This will return a Series, indexed like the existing Series. See the :ref:`docs ` - -.. ipython:: python - - # datetime - s = Series(date_range('20130101 09:10:12',periods=4)) - s - s.dt.hour - s.dt.second - s.dt.day - -This enables nice expressions like this: - -.. ipython:: python - - s[s.dt.day==2] - -.. ipython:: python - - # period - s = Series(period_range('20130101',periods=4,freq='D').asobject) - s - s.dt.year - s.dt.day - -.. _whatsnew_0150.refactoring: - -Internal Refactoring -~~~~~~~~~~~~~~~~~~~~ - -In 0.15.0 ``Index`` has internally been refactored to no longer sub-class ``ndarray`` -but instead subclass ``PandasObject``, similarly to the rest of the pandas objects. This change allows very easy sub-classing and creation of new index types. 
This should be -a transparent change with only very limited API implications (:issue:`5080`, :issue:`7439`, :issue:`7796`, :issue:`8024`) - -- you may need to unpickle pandas version < 0.15.0 pickles using ``pd.read_pickle`` rather than ``pickle.load``. See :ref:`pickle docs ` -- when plotting with a ``PeriodIndex``. The ``matplotlib`` internal axes will now be arrays of ``Period`` rather than a ``PeriodIndex``. (this is similar to how a ``DatetimeIndex`` passes arrays of ``datetimes`` now) -- MultiIndexes will now raise similary to other pandas objects w.r.t. truth testing, See :ref:`here ` (:issue:`7897`). - -.. _whatsnew_0150.cat: - -Categoricals in Series/DataFrame -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -:class:`~pandas.Categorical` can now be included in `Series` and `DataFrames` and gained new -methods to manipulate. Thanks to Jan Schultz for much of this API/implementation. (:issue:`3943`, :issue:`5313`, :issue:`5314`, -:issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`, :issue:`7914`). - -For full docs, see the :ref:`Categorical introduction ` and the :ref:`API documentation `. - -.. ipython:: python - - df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']}) - - # convert the raw grades to a categorical - df["grade"] = pd.Categorical(df["raw_grade"]) - - # Alternative: df["grade"] = df["raw_grade"].astype("category") - df["grade"] - - # Rename the levels - df["grade"].cat.levels = ["very good", "good", "very bad"] - - # Reorder the levels and simultaneously add the missing levels - df["grade"].cat.reorder_levels(["very bad", "bad", "medium", "good", "very good"]) - df["grade"] - df.sort("grade") - df.groupby("grade").size() - -- ``pandas.core.group_agg`` and ``pandas.core.factor_agg`` were removed. As an alternative, construct - a dataframe and use ``df.groupby().agg()``. - -- Supplying "codes/labels and levels" to the :class:`~pandas.Categorical` constructor is deprecated and does - not work without supplying ``compat=True``. The default mode now uses "values and levels". - Please change your code to use the :meth:`~pandas.Categorical.from_codes` constructor. - -- The ``Categorical.labels`` attribute was renamed to ``Categorical.codes`` and is read - only. If you want to manipulate codes, please use one of the - :ref:`API methods on Categoricals `. - - - - - - - - - - - - - - - - - - - - - - - - -.. _whatsnew_0150.prior_deprecations: - -Prior Version Deprecations/Changes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -There are no prior version deprecations that are taking effect as of 0.15.0. - -.. _whatsnew_0150.deprecations: - -Deprecations -~~~~~~~~~~~~ - -.. _whatsnew_0150.knownissues: - -Known Issues -~~~~~~~~~~~~ - -.. _whatsnew_0150.enhancements: - -Enhancements -~~~~~~~~~~~~ -- Added support for bool, uint8, uint16 and uint32 datatypes in ``to_stata`` (:issue:`7097`, :issue:`7365`) - - -- ``PeriodIndex`` supports ``resolution`` as the same as ``DatetimeIndex`` (:issue:`7708`) -- ``pandas.tseries.holiday`` has added support for additional holidays and ways to observe holidays (:issue:`7070`) -- ``pandas.tseries.holiday.Holiday`` now supports a list of offsets in Python3 (:issue:`7070`) - - - - -- ``Period`` and ``PeriodIndex`` supports addition/subtraction with ``timedelta``-likes (:issue:`7966`) - - If ``Period`` freq is ``D``, ``H``, ``T``, ``S``, ``L``, ``U``, ``N``, ``timedelta``-like can be added if the result can have same freq. Otherwise, only the same ``offsets`` can be added. - - .. 
ipython:: python - - idx = pd.period_range('2014-07-01 09:00', periods=5, freq='H') - idx - idx + pd.offsets.Hour(2) - idx + timedelta(minutes=120) - idx + np.timedelta64(7200, 's') - - idx = pd.period_range('2014-07', periods=5, freq='M') - idx - idx + pd.offsets.MonthEnd(3) - - - - - - - - - - - - - - - - - - - - -.. _whatsnew_0150.performance: - -Performance -~~~~~~~~~~~ - -- Performance improvements in ``DatetimeIndex.__iter__`` to allow faster iteration (:issue:`7683`) -- Performance improvements in ``Period`` creation (and ``PeriodIndex`` setitem) (:issue:`5155`) -- Improvements in Series.transform for significant performance gains (revised) (:issue:`6496`) - - - - - - - - - - - - - - - - - - - - - -.. _whatsnew_0150.experimental: - -Experimental -~~~~~~~~~~~~ - -There are no experimental changes in 0.15.0 - -.. _whatsnew_0150.bug_fixes: - -Bug Fixes -~~~~~~~~~ -- Bug in multiindexes dtypes getting mixed up when DataFrame is saved to SQL table (:issue:`8021`) -- Bug in Series 0-division with a float and integer operand dtypes (:issue:`7785`) -- Bug in ``Series.astype("unicode")`` not calling ``unicode`` on the values correctly (:issue:`7758`) -- Bug in ``DataFrame.as_matrix()`` with mixed ``datetime64[ns]`` and ``timedelta64[ns]`` dtypes (:issue:`7778`) -- Bug in ``HDFStore.select_column()`` not preserving UTC timezone info when selecting a DatetimeIndex (:issue:`7777`) -- Bug in ``to_datetime`` when ``format='%Y%m%d'`` and ``coerce=True`` are specified, where previously an object array was returned (rather than - a coerced time-series with ``NaT``), (:issue:`7930`) -- Bug in ``DatetimeIndex`` and ``PeriodIndex`` in-place addition and subtraction cause different result from normal one (:issue:`6527`) -- Bug in adding and subtracting ``PeriodIndex`` with ``PeriodIndex`` raise ``TypeError`` (:issue:`7741`) -- Bug in ``combine_first`` with ``PeriodIndex`` data raises ``TypeError`` (:issue:`3367`) -- Bug in multi-index slicing with missing indexers (:issue:`7866`) -- Regression in multi-index indexing with a non-scalar type object (:issue:`7914`) - -- Bug in pickles contains ``DateOffset`` may raise ``AttributeError`` when ``normalize`` attribute is reffered internally (:issue:`7748`) - -- Bug in pickle deserialization that failed for pre-0.14.1 containers with dup items trying to avoid ambiguity - when matching block and manager items, when there's only one block there's no ambiguity (:issue:`7794`) - -- Bug in HDFStore iteration when passing a where (:issue:`8014`) - -- Bug in repeated timeseries line and area plot may result in ``ValueError`` or incorrect kind (:issue:`7733`) - - -- Bug in ``offsets.apply``, ``rollforward`` and ``rollback`` may reset nanosecond (:issue:`7697`) -- Bug in ``offsets.apply``, ``rollforward`` and ``rollback`` may raise ``AttributeError`` if ``Timestamp`` has ``dateutil`` tzinfo (:issue:`7697`) - - -- Bug in ``is_superperiod`` and ``is_subperiod`` cannot handle higher frequencies than ``S`` (:issue:`7760`, :issue:`7772`, :issue:`7803`) - -- Bug in ``PeriodIndex.unique`` returns int64 ``np.ndarray`` (:issue:`7540`) - -- Bug in ``DataFrame.reset_index`` which has ``MultiIndex`` contains ``PeriodIndex`` or ``DatetimeIndex`` with tz raises ``ValueError`` (:issue:`7746`, :issue:`7793`) - - - -- Bug in ``DataFrame.plot`` with ``subplots=True`` may draw unnecessary minor xticks and yticks (:issue:`7801`) -- Bug in ``StataReader`` which did not read variable labels in 117 files due to difference between Stata documentation and implementation (:issue:`7816`) -- Bug 
in ``StataReader`` where strings were always converted to 244 characters-fixed width irrespective of underlying string size (:issue:`7858`) -- Bug in ``expanding_cov``, ``expanding_corr``, ``rolling_cov``, ``rolling_cov``, ``ewmcov``, and ``ewmcorr`` - returning results with columns sorted by name and producing an error for non-unique columns; - now handles non-unique columns and returns columns in original order - (except for the case of two DataFrames with ``pairwise=False``, where behavior is unchanged) (:issue:`7542`) - - -- Bug in ``DataFrame.plot`` and ``Series.plot`` may ignore ``rot`` and ``fontsize`` keywords (:issue:`7844`) - - -- Bug in ``DatetimeIndex.value_counts`` doesn't preserve tz (:issue:`7735`) -- Bug in ``PeriodIndex.value_counts`` results in ``Int64Index`` (:issue:`7735`) - - - -- Bug in ``GroupBy.transform()`` where int groups with a transform that - didn't preserve the index were incorrectly truncated (:issue:`7972`). - -- Bug in ``groupby`` where callable objects without name attributes would take the wrong path, - and produce a ``DataFrame`` instead of a ``Series`` (:issue:`7929`) - - -- Bug in ``read_html`` where the ``infer_types`` argument forced coercion of - date-likes incorrectly (:issue:`7762`, :issue:`7032`). - - -- Bug in ``Series.str.cat`` with an index which was filtered as to not include the first item (:issue:`7857`) - - -- Bug in ``Timestamp`` cannot parse ``nanosecond`` from string (:issue:`7878`) -- Bug in ``Timestamp`` with string offset and ``tz`` results incorrect (:issue:`7833`) - -- Bug in ``tslib.tz_convert`` and ``tslib.tz_convert_single`` may return different results (:issue:`7798`) -- Bug in ``DatetimeIndex.intersection`` of non-overlapping timestamps with tz raises ``IndexError`` (:issue:`7880`) - - - -- Bug in ``GroupBy.filter()`` where fast path vs. slow path made the filter - return a non scalar value that appeared valid but wasn't (:issue:`7870`). -- Bug in ``date_range()``/``DatetimeIndex()`` when the timezone was inferred from input dates yet incorrect - times were returned when crossing DST boundaries (:issue:`7835`, :issue:`7901`). - - -- Bug in area plot draws legend with incorrect ``alpha`` when ``stacked=True`` (:issue:`8027`) - -- ``Period`` and ``PeriodIndex`` addition/subtraction with ``np.timedelta64`` results in incorrect internal representations (:issue:`7740`) - - - - - - - - - - - - - - -- Bug in installation where ``html_encoding/*.html`` wasn't installed and - therefore some tests were not running correctly (:issue:`7927`). - -- Bug in ``read_html`` where ``bytes`` objects were not tested for in - ``_read`` (:issue:`7927`). - - - - - - - - - - - - - - - - - - - diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst index 40b5d7c1599c1..f30d6c9d5d4c0 100644 --- a/doc/source/visualization.rst +++ b/doc/source/visualization.rst @@ -13,7 +13,11 @@ np.set_printoptions(precision=4, suppress=True) import matplotlib.pyplot as plt plt.close('all') - options.display.mpl_style = 'default' + import matplotlib + try: + matplotlib.style.use('ggplot') + except AttributeError: + options.display.mpl_style = 'default' options.display.max_rows = 15 from pandas.compat import lrange @@ -29,14 +33,11 @@ We use the standard convention for referencing the matplotlib API: .. versionadded:: 0.11.0 -The ``display.mpl_style`` produces more appealing plots. +The plots in this document are made using matplotlib's ``ggplot`` style (new in version 1.4). 
+If your version of matplotlib is 1.3 or lower, setting the ``display.mpl_style`` to ``'default'`` +with ``pd.options.display.mpl_style = 'default'`` +to produce more appealing plots. When set, matplotlib's ``rcParams`` are changed (globally!) to nicer-looking settings. -All the plots in the documentation are rendered with this option set to the -'default' style. - -.. ipython:: python - - pd.options.display.mpl_style = 'default' We provide the basics in pandas to easily create decent looking plots. See the :ref:`ecosystem ` section for visualization @@ -77,6 +78,7 @@ On DataFrame, :meth:`~DataFrame.plot` is a convenience to plot all of the column .. ipython:: python :suppress: + plt.close('all') np.random.seed(123456) .. ipython:: python @@ -93,6 +95,7 @@ You can plot one column versus another using the `x` and `y` keywords in .. ipython:: python :suppress: + plt.close('all') plt.figure() np.random.seed(123456) @@ -124,9 +127,10 @@ These include: * :ref:`'bar' ` or :ref:`'barh' ` for bar plots * :ref:`'hist' ` for histogram +* :ref:`'box' ` for boxplot * :ref:`'kde' ` or ``'density'`` for density plots * :ref:`'area' ` for area plots -* :ref:`'scatter' ` for scatter plots +* :ref:`'scatter' ` for scatter plots * :ref:`'hexbin' ` for hexagonal bin plots * :ref:`'pie' ` for pie plots @@ -168,6 +172,7 @@ bar plot: .. ipython:: python :suppress: + plt.close('all') plt.figure() np.random.seed(123456) @@ -183,6 +188,7 @@ To produce a stacked bar plot, pass ``stacked=True``: .. ipython:: python :suppress: + plt.close('all') plt.figure() .. ipython:: python @@ -195,6 +201,7 @@ To get horizontal bar plots, pass ``kind='barh'``: .. ipython:: python :suppress: + plt.close('all') plt.figure() .. ipython:: python @@ -221,6 +228,12 @@ Histogram can be drawn specifying ``kind='hist'``. @savefig hist_new.png df4.plot(kind='hist', alpha=0.5) + +.. ipython:: python + :suppress: + + plt.close('all') + Histogram can be stacked by ``stacked=True``. Bin size can be changed by ``bins`` keyword. .. ipython:: python @@ -230,6 +243,11 @@ Histogram can be stacked by ``stacked=True``. Bin size can be changed by ``bins` @savefig hist_new_stacked.png df4.plot(kind='hist', stacked=True, bins=20) +.. ipython:: python + :suppress: + + plt.close('all') + You can pass other keywords supported by matplotlib ``hist``. For example, horizontal and cumulative histgram can be drawn by ``orientation='horizontal'`` and ``cumulative='True'``. .. ipython:: python @@ -239,12 +257,16 @@ You can pass other keywords supported by matplotlib ``hist``. For example, horiz @savefig hist_new_kwargs.png df4['a'].plot(kind='hist', orientation='horizontal', cumulative=True) +.. ipython:: python + :suppress: + + plt.close('all') See the :meth:`hist ` method and the `matplotlib hist documenation `__ for more. -The previous interface ``DataFrame.hist`` to plot histogram still can be used. +The existing interface ``DataFrame.hist`` to plot histogram still can be used. .. ipython:: python @@ -253,6 +275,10 @@ The previous interface ``DataFrame.hist`` to plot histogram still can be used. @savefig hist_plot_ex.png df['A'].diff().hist() +.. ipython:: python + :suppress: + + plt.close('all') :meth:`DataFrame.hist` plots the histograms of the columns on multiple subplots: @@ -272,6 +298,7 @@ The ``by`` keyword can be specified to plot grouped histograms: .. 
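In addition to the documented example that follows, here is a hedged, self-contained sketch of grouped histograms; the column names and data are invented for illustration.

.. code-block:: python

    import numpy as np
    from pandas import DataFrame

    data = DataFrame({'length': np.random.randn(200),
                      'species': np.random.choice(['cat', 'dog'], 200)})

    # one histogram panel per value of the grouping column
    data['length'].hist(by=data['species'], bins=20)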
ipython:: python :suppress: + plt.close('all') plt.figure() np.random.seed(123456) @@ -288,8 +315,12 @@ The ``by`` keyword can be specified to plot grouped histograms: Box Plots ~~~~~~~~~ -DataFrame has a :meth:`~DataFrame.boxplot` method that allows you to visualize the -distribution of values within each column. +Boxplot can be drawn calling a ``Series`` and ``DataFrame.plot`` with ``kind='box'``, +or ``DataFrame.boxplot`` to visualize the distribution of values within each column. + +.. versionadded:: 0.15.0 + +``plot`` method now supports ``kind='box'`` to draw boxplot. For instance, here is a boxplot representing five trials of 10 observations of a uniform random variable on [0,1). @@ -297,6 +328,62 @@ a uniform random variable on [0,1). .. ipython:: python :suppress: + plt.close('all') + np.random.seed(123456) + +.. ipython:: python + + df = DataFrame(rand(10, 5), columns=['A', 'B', 'C', 'D', 'E']) + + @savefig box_plot_new.png + df.plot(kind='box') + +Boxplot can be colorized by passing ``color`` keyword. You can pass a ``dict`` +whose keys are ``boxes``, ``whiskers``, ``medians`` and ``caps``. +If some keys are missing in the ``dict``, default colors are used +for the corresponding artists. Also, boxplot has ``sym`` keyword to specify fliers style. + +When you pass other type of arguments via ``color`` keyword, it will be directly +passed to matplotlib for all the ``boxes``, ``whiskers``, ``medians`` and ``caps`` +colorization. + +The colors are applied to every boxes to be drawn. If you want +more complicated colorization, you can get each drawn artists by passing +:ref:`return_type `. + +.. ipython:: python + + color = dict(boxes='DarkGreen', whiskers='DarkOrange', + medians='DarkBlue', caps='Gray') + + @savefig box_new_colorize.png + df.plot(kind='box', color=color, sym='r+') + +.. ipython:: python + :suppress: + + plt.close('all') + +Also, you can pass other keywords supported by matplotlib ``boxplot``. +For example, horizontal and custom-positioned boxplot can be drawn by +``vert=False`` and ``positions`` keywords. + +.. ipython:: python + + @savefig box_new_kwargs.png + df.plot(kind='box', vert=False, positions=[1, 4, 5, 6, 8]) + + +See the :meth:`boxplot ` method and the +`matplotlib boxplot documenation `__ for more. + + +The existing interface ``DataFrame.boxplot`` to plot boxplot still can be used. + +.. ipython:: python + :suppress: + + plt.close('all') np.random.seed(123456) .. ipython:: python @@ -314,6 +401,7 @@ groupings. For instance, .. ipython:: python :suppress: + plt.close('all') np.random.seed(123456) .. ipython:: python @@ -333,6 +421,7 @@ columns: .. ipython:: python :suppress: + plt.close('all') np.random.seed(123456) .. ipython:: python @@ -354,18 +443,23 @@ columns: .. _visualization.box.return: -The return type of ``boxplot`` depends on two keyword arguments: ``by`` and ``return_type``. -When ``by`` is ``None``: +Basically, plot functions return :class:`matplotlib Axes ` as a return value. +In ``boxplot``, the return type can be changed by argument ``return_type``, and whether the subplots is enabled (``subplots=True`` in ``plot`` or ``by`` is specified in ``boxplot``). + +When ``subplots=False`` / ``by`` is ``None``: * if ``return_type`` is ``'dict'``, a dictionary containing the :class:`matplotlib Lines ` is returned. The keys are "boxes", "caps", "fliers", "medians", and "whiskers". - This is the default. + This is the default of ``boxplot`` in historical reason. + Note that ``plot(kind='box')`` returns ``Axes`` as default as the same as other plots. 
* if ``return_type`` is ``'axes'``, a :class:`matplotlib Axes ` containing the boxplot is returned. * if ``return_type`` is ``'both'`` a namedtuple containging the :class:`matplotlib Axes ` and :class:`matplotlib Lines ` is returned -When ``by`` is some column of the DataFrame, a dict of ``return_type`` is returned, where -the keys are the columns of the DataFrame. The plot has a facet for each column of -the DataFrame, with a separate box for each value of ``by``. +When ``subplots=True`` / ``by`` is some column of the DataFrame: + +* A dict of ``return_type`` is returned, where the keys are the columns + of the DataFrame. The plot has a facet for each column of + the DataFrame, with a separate box for each value of ``by``. Finally, when calling boxplot on a :class:`Groupby` object, a dict of ``return_type`` is returned, where the keys are the same as the Groupby object. The plot has a @@ -383,6 +477,11 @@ DataFrame. @savefig boxplot_groupby.png bp = df_box.boxplot(by='g') +.. ipython:: python + :suppress: + + plt.close('all') + Compare to: .. ipython:: python @@ -391,6 +490,11 @@ Compare to: @savefig groupby_boxplot_vis.png bp = df_box.groupby('g').boxplot() +.. ipython:: python + :suppress: + + plt.close('all') + .. _visualization.area_plot: Area Plot @@ -420,6 +524,7 @@ To produce an unstacked plot, pass ``stacked=False``. Alpha value is set to 0.5 .. ipython:: python :suppress: + plt.close('all') plt.figure() .. ipython:: python @@ -427,6 +532,77 @@ To produce an unstacked plot, pass ``stacked=False``. Alpha value is set to 0.5 @savefig area_plot_unstacked.png df.plot(kind='area', stacked=False); +.. _visualization.scatter: + +Scatter Plot +~~~~~~~~~~~~ + +.. versionadded:: 0.13 + +You can create scatter plots with ``DataFrame.plot`` by passing ``kind='scatter'``. +Scatter plot requires numeric columns for x and y axis. +These can be specified by ``x`` and ``y`` keywords each. + +.. ipython:: python + :suppress: + + np.random.seed(123456) + plt.close('all') + plt.figure() + +.. ipython:: python + + df = DataFrame(rand(50, 4), columns=['a', 'b', 'c', 'd']) + + @savefig scatter_plot.png + df.plot(kind='scatter', x='a', y='b'); + +To plot multiple column groups in a single axes, repeat ``plot`` method specifying target ``ax``. +It is recommended to specify ``color`` and ``label`` keywords to distinguish each groups. + +.. ipython:: python + + ax = df.plot(kind='scatter', x='a', y='b', + color='DarkBlue', label='Group 1'); + @savefig scatter_plot_repeated.png + df.plot(kind='scatter', x='c', y='d', + color='DarkGreen', label='Group 2', ax=ax); + +.. ipython:: python + :suppress: + + plt.close('all') + +The keyword ``c`` may be given as the name of a column to provide colors for +each point: + +.. ipython:: python + + @savefig scatter_plot_colored.png + df.plot(kind='scatter', x='a', y='b', c='c', s=50); + + +.. ipython:: python + :suppress: + + plt.close('all') + +You can pass other keywords supported by matplotlib ``scatter``. +Below example shows a bubble chart using a dataframe column values as bubble size. + +.. ipython:: python + + @savefig scatter_plot_bubble.png + df.plot(kind='scatter', x='a', y='b', s=df['c']*200); + +.. ipython:: python + :suppress: + + plt.close('all') + +See the :meth:`scatter ` method and the +`matplotlib scatter documenation `__ for more. + .. _visualization.hexbin: Hexagonal Bin Plot @@ -469,6 +645,7 @@ given by column ``z``. The bins are aggregated with numpy's ``max`` function. .. 
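In the same spirit as the documented ``max`` example that follows, a short sketch using a different aggregator (``numpy.median``); the data and column names are arbitrary.

.. code-block:: python

    import numpy as np
    from pandas import DataFrame

    df = DataFrame(np.random.randn(1000, 2), columns=['a', 'b'])
    df['z'] = np.random.uniform(0, 10, 1000)

    # colour each hexagon by the median z value of the points it contains,
    # rather than by the default count
    df.plot(kind='hexbin', x='a', y='b', C='z',
            reduce_C_function=np.median, gridsize=20)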
ipython:: python :suppress: + plt.close('all') plt.figure() np.random.seed(123456) @@ -482,6 +659,10 @@ given by column ``z``. The bins are aggregated with numpy's ``max`` function. df.plot(kind='hexbin', x='a', y='b', C='z', reduce_C_function=np.max, gridsize=25) +.. ipython:: python + :suppress: + + plt.close('all') See the :meth:`hexbin ` method and the `matplotlib hexbin documenation `__ for more. @@ -508,7 +689,16 @@ A ``ValueError`` will be raised if there are any negative values in your data. series = Series(3 * rand(4), index=['a', 'b', 'c', 'd'], name='series') @savefig series_pie_plot.png - series.plot(kind='pie') + series.plot(kind='pie', figsize=(6, 6)) + +.. ipython:: python + :suppress: + + plt.close('all') + +For pie plots it's best to use square figures, one's with an equal aspect ratio. You can create the +figure with equal width and height, or force the aspect ratio to be equal after plotting by +calling ``ax.set_aspect('equal')`` on the returned ``axes`` object. Note that pie plot with :class:`DataFrame` requires that you either specify a target column by the ``y`` argument or ``subplots=True``. When ``y`` is specified, pie plot of selected column @@ -526,7 +716,12 @@ A legend will be drawn in each pie plots by default; specify ``legend=False`` to df = DataFrame(3 * rand(4, 2), index=['a', 'b', 'c', 'd'], columns=['x', 'y']) @savefig df_pie_plot.png - df.plot(kind='pie', subplots=True) + df.plot(kind='pie', subplots=True, figsize=(8, 4)) + +.. ipython:: python + :suppress: + + plt.close('all') You can use the ``labels`` and ``colors`` keywords to specify the labels and colors of each wedge. @@ -549,13 +744,14 @@ Also, other keywords supported by :func:`matplotlib.pyplot.pie` can be used. @savefig series_pie_plot_options.png series.plot(kind='pie', labels=['AA', 'BB', 'CC', 'DD'], colors=['r', 'g', 'b', 'c'], - autopct='%.2f', fontsize=20) + autopct='%.2f', fontsize=20, figsize=(6, 6)) If you pass values whose sum total is less than 1.0, matplotlib draws a semicircle. .. ipython:: python :suppress: + plt.close('all') plt.figure() .. ipython:: python @@ -563,7 +759,7 @@ If you pass values whose sum total is less than 1.0, matplotlib draws a semicirc series = Series([0.1] * 4, index=['a', 'b', 'c', 'd'], name='series2') @savefig series_pie_plot_semi.png - series.plot(kind='pie') + series.plot(kind='pie', figsize=(6, 6)) See the `matplotlib pie documenation `__ for more. @@ -572,6 +768,44 @@ See the `matplotlib pie documenation `, @@ -829,6 +1098,11 @@ shown by default. @savefig frame_plot_basic_noleg.png df.plot(legend=False) +.. ipython:: python + :suppress: + + plt.close('all') + Scales ~~~~~~ @@ -840,7 +1114,6 @@ You may pass ``logy`` to get a log-scale Y axis. plt.figure() np.random.seed(123456) - .. ipython:: python ts = Series(randn(1000), index=date_range('1/1/2000', periods=1000)) @@ -849,6 +1122,11 @@ You may pass ``logy`` to get a log-scale Y axis. @savefig series_plot_logy.png ts.plot(logy=True) +.. ipython:: python + :suppress: + + plt.close('all') + See also the ``logx`` and ``loglog`` keyword arguments. Plotting on a Secondary Y-axis @@ -868,6 +1146,11 @@ To plot data on a secondary y-axis, use the ``secondary_y`` keyword: @savefig series_plot_secondary_y.png df.B.plot(secondary_y=True, style='g') +.. ipython:: python + :suppress: + + plt.close('all') + To plot some columns in a DataFrame, give the column names to the ``secondary_y`` keyword: @@ -879,6 +1162,10 @@ keyword: @savefig frame_plot_secondary_y.png ax.right_ax.set_ylabel('AB scale') +.. 
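A compact, self-contained variation on the example above: putting one large-magnitude column on its own right-hand axis so the other stays readable. The column names and scale factor are made up for illustration.

.. code-block:: python

    import numpy as np
    from pandas import DataFrame, date_range

    idx = date_range('2000-01-01', periods=100)
    df = DataFrame({'small': np.random.randn(100).cumsum(),
                    'large': 1000 * np.random.randn(100).cumsum()},
                   index=idx)

    # 'large' is drawn against a secondary y-axis on the right
    df.plot(secondary_y=['large'])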
ipython:: python + :suppress: + + plt.close('all') Note that the columns plotted on the secondary y-axis is automatically marked with "(right)" in the legend. To turn off the automatic marking, use the @@ -891,11 +1178,15 @@ with "(right)" in the legend. To turn off the automatic marking, use the @savefig frame_plot_secondary_y_no_right.png df.plot(secondary_y=['A', 'B'], mark_right=False) +.. ipython:: python + :suppress: + + plt.close('all') Suppressing Tick Resolution Adjustment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -pandas includes automatically tick resolution adjustment for regular frequency +pandas includes automatic tick resolution adjustment for regular frequency time-series data. For limited cases where pandas cannot infer the frequency information (e.g., in an externally created ``twinx``), you can choose to suppress this behavior for alignment purposes. @@ -909,6 +1200,10 @@ Here is the default behavior, notice how the x-axis tick labelling is performed: @savefig ser_plot_suppress.png df.A.plot() +.. ipython:: python + :suppress: + + plt.close('all') Using the ``x_compat`` parameter, you can suppress this behavior: @@ -919,6 +1214,10 @@ Using the ``x_compat`` parameter, you can suppress this behavior: @savefig ser_plot_suppress_parm.png df.A.plot(x_compat=True) +.. ipython:: python + :suppress: + + plt.close('all') If you have more than one plot that needs to be suppressed, the ``use`` method in ``pandas.plot_params`` can be used in a `with statement`: @@ -935,6 +1234,11 @@ in ``pandas.plot_params`` can be used in a `with statement`: df.B.plot(color='g') df.C.plot(color='b') +.. ipython:: python + :suppress: + + plt.close('all') + Subplots ~~~~~~~~ @@ -946,10 +1250,72 @@ with the ``subplots`` keyword: @savefig frame_plot_subplots.png df.plot(subplots=True, figsize=(6, 6)); -Targeting Different Subplots -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. ipython:: python + :suppress: + + plt.close('all') + +Using Layout and Targetting Multiple Axes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The layout of subplots can be specified by ``layout`` keyword. It can accept +``(rows, columns)``. The ``layout`` keyword can be used in +``hist`` and ``boxplot`` also. If input is invalid, ``ValueError`` will be raised. + +The number of axes which can be contained by rows x columns specified by ``layout`` must be +larger than the number of required subplots. If layout can contain more axes than required, +blank axes are not drawn. Similar to a numpy array's ``reshape`` method, you +can use ``-1`` for one dimension to automatically calculate the number of rows +or columns needed, given the other. + +.. ipython:: python + + @savefig frame_plot_subplots_layout.png + df.plot(subplots=True, layout=(3, 2), figsize=(6, 6), sharex=False); + +.. ipython:: python + :suppress: + + plt.close('all') + +The above example is identical to using + +.. ipython:: python + + df.plot(subplots=True, layout=(3, -1), figsize=(6, 6), sharex=False); + +.. ipython:: python + :suppress: + + plt.close('all') + +The required number of columns (2) is inferred from the number of series to plot +and the given number of rows (3). + +Also, you can pass multiple axes created beforehand as list-like via ``ax`` keyword. +This allows to use more complicated layout. +The passed axes must be the same number as the subplots being drawn. + +When multiple axes are passed via ``ax`` keyword, ``layout``, ``sharex`` and ``sharey`` keywords are ignored. +These must be configured when creating axes. + +.. 
ipython:: python + + fig, axes = plt.subplots(4, 4, figsize=(6, 6)); + plt.subplots_adjust(wspace=0.5, hspace=0.5); + target1 = [axes[0][0], axes[1][1], axes[2][2], axes[3][3]] + target2 = [axes[3][0], axes[2][1], axes[1][2], axes[0][3]] -You can pass an ``ax`` argument to :meth:`Series.plot` to plot on a particular axis: + df.plot(subplots=True, ax=target1, legend=False); + @savefig frame_plot_subplots_multi_ax.png + (-df).plot(subplots=True, ax=target2, legend=False); + +.. ipython:: python + :suppress: + + plt.close('all') + +Another option is passing an ``ax`` argument to :meth:`Series.plot` to plot on a particular axis: .. ipython:: python :suppress: @@ -961,15 +1327,20 @@ You can pass an ``ax`` argument to :meth:`Series.plot` to plot on a particular a df = DataFrame(randn(1000, 4), index=ts.index, columns=list('ABCD')) df = df.cumsum() +.. ipython:: python + :suppress: + + plt.close('all') + .. ipython:: python fig, axes = plt.subplots(nrows=2, ncols=2) - df['A'].plot(ax=axes[0,0]); axes[0,0].set_title('A') - df['B'].plot(ax=axes[0,1]); axes[0,1].set_title('B') - df['C'].plot(ax=axes[1,0]); axes[1,0].set_title('C') + df['A'].plot(ax=axes[0,0]); axes[0,0].set_title('A'); + df['B'].plot(ax=axes[0,1]); axes[0,1].set_title('B'); + df['C'].plot(ax=axes[1,0]); axes[1,0].set_title('C'); @savefig series_plot_multi.png - df['D'].plot(ax=axes[1,1]); axes[1,1].set_title('D') + df['D'].plot(ax=axes[1,1]); axes[1,1].set_title('D'); .. ipython:: python :suppress: @@ -1013,6 +1384,11 @@ Here is an example of one way to easily plot group means with standard deviation @savefig errorbar_example.png means.plot(yerr=errors, ax=ax, kind='bar') +.. ipython:: python + :suppress: + + plt.close('all') + .. _visualization.table: Plotting Tables @@ -1036,6 +1412,11 @@ Plotting with matplotlib table is now supported in :meth:`DataFrame.plot` and : @savefig line_plot_table_true.png df.plot(table=True, ax=ax) +.. ipython:: python + :suppress: + + plt.close('all') + Also, you can pass different :class:`DataFrame` or :class:`Series` for ``table`` keyword. The data will be drawn as displayed in print method (not transposed automatically). If required, it should be transposed manually as below example. .. ipython:: python @@ -1045,6 +1426,10 @@ Also, you can pass different :class:`DataFrame` or :class:`Series` for ``table`` @savefig line_plot_table_data.png df.plot(table=np.round(df.T, 2), ax=ax) +.. ipython:: python + :suppress: + + plt.close('all') Finally, there is a helper function ``pandas.tools.plotting.table`` to create a table from :class:`DataFrame` and :class:`Series`, and add it to an ``matplotlib.Axes``. This function can accept keywords which matplotlib table has. @@ -1059,6 +1444,11 @@ Finally, there is a helper function ``pandas.tools.plotting.table`` to create a @savefig line_plot_table_describe.png df.plot(ax=ax, ylim=(0, 2), legend=None) +.. ipython:: python + :suppress: + + plt.close('all') + **Note**: You can get table instances on the axes using ``axes.tables`` property for further decorations. See the `matplotlib table documenation `__ for more. .. _visualization.colormaps: @@ -1079,7 +1469,7 @@ colors are selected based on an even spacing determined by the number of columns in the DataFrame. There is no consideration made for background color, so some colormaps will produce lines that are not easily visible. -To use the cubhelix colormap, we can simply pass ``'cubehelix'`` to ``colormap=`` +To use the cubehelix colormap, we can simply pass ``'cubehelix'`` to ``colormap=`` .. 
ipython:: python :suppress: @@ -1096,6 +1486,11 @@ To use the cubhelix colormap, we can simply pass ``'cubehelix'`` to ``colormap=` @savefig cubehelix.png df.plot(colormap='cubehelix') +.. ipython:: python + :suppress: + + plt.close('all') + or we can pass the colormap itself .. ipython:: python @@ -1107,6 +1502,11 @@ or we can pass the colormap itself @savefig cubehelix_cm.png df.plot(colormap=cm.cubehelix) +.. ipython:: python + :suppress: + + plt.close('all') + Colormaps can also be used other plot types, like bar charts: .. ipython:: python @@ -1124,6 +1524,11 @@ Colormaps can also be used other plot types, like bar charts: @savefig greens.png dd.plot(kind='bar', colormap='Greens') +.. ipython:: python + :suppress: + + plt.close('all') + Parallel coordinates charts: .. ipython:: python @@ -1133,6 +1538,11 @@ Parallel coordinates charts: @savefig parallel_gist_rainbow.png parallel_coordinates(data, 'Name', colormap='gist_rainbow') +.. ipython:: python + :suppress: + + plt.close('all') + Andrews curves charts: .. ipython:: python @@ -1142,6 +1552,10 @@ Andrews curves charts: @savefig andrews_curve_winter.png andrews_curves(data, 'Name', colormap='winter') +.. ipython:: python + :suppress: + + plt.close('all') Plotting directly with matplotlib --------------------------------- @@ -1185,3 +1599,218 @@ when plotting a large number of points. :suppress: plt.close('all') + + +.. _rplot: + + +Trellis plotting interface +-------------------------- + +.. ipython:: python + :suppress: + + import numpy as np + np.random.seed(123456) + from pandas import * + options.display.max_rows=15 + import pandas.util.testing as tm + randn = np.random.randn + np.set_printoptions(precision=4, suppress=True) + import matplotlib.pyplot as plt + tips_data = read_csv('data/tips.csv') + iris_data = read_csv('data/iris.data') + from pandas import read_csv + from pandas.tools.plotting import radviz + import pandas.tools.rplot as rplot + plt.close('all') + + +.. note:: + + The tips data set can be downloaded `here + `__. Once you download it execute + + .. code-block:: python + + from pandas import read_csv + tips_data = read_csv('tips.csv') + + from the directory where you downloaded the file. + +We import the rplot API: + +.. ipython:: python + + import pandas.tools.rplot as rplot + +Examples +~~~~~~~~ + +RPlot is a flexible API for producing Trellis plots. These plots allow you to arrange data in a rectangular grid by values of certain attributes. + +.. ipython:: python + + plt.figure() + + plot = rplot.RPlot(tips_data, x='total_bill', y='tip') + plot.add(rplot.TrellisGrid(['sex', 'smoker'])) + plot.add(rplot.GeomHistogram()) + + @savefig rplot1_tips.png + plot.render(plt.gcf()) + +.. ipython:: python + :suppress: + + plt.close('all') + +In the example above, data from the tips data set is arranged by the attributes 'sex' and 'smoker'. Since both of those attributes can take on one of two values, the resulting grid has two columns and two rows. A histogram is displayed for each cell of the grid. + +.. ipython:: python + + plt.figure() + + plot = rplot.RPlot(tips_data, x='total_bill', y='tip') + plot.add(rplot.TrellisGrid(['sex', 'smoker'])) + plot.add(rplot.GeomDensity()) + + @savefig rplot2_tips.png + plot.render(plt.gcf()) + +.. ipython:: python + :suppress: + + plt.close('all') + +Example above is the same as previous except the plot is set to kernel density estimation. This shows how easy it is to have different plots for the same Trellis structure. + +.. 
ipython:: python + + plt.figure() + + plot = rplot.RPlot(tips_data, x='total_bill', y='tip') + plot.add(rplot.TrellisGrid(['sex', 'smoker'])) + plot.add(rplot.GeomScatter()) + plot.add(rplot.GeomPolyFit(degree=2)) + + @savefig rplot3_tips.png + plot.render(plt.gcf()) + +.. ipython:: python + :suppress: + + plt.close('all') + +The plot above shows that it is possible to have two or more plots for the same data displayed on the same Trellis grid cell. + +.. ipython:: python + + plt.figure() + + plot = rplot.RPlot(tips_data, x='total_bill', y='tip') + plot.add(rplot.TrellisGrid(['sex', 'smoker'])) + plot.add(rplot.GeomScatter()) + plot.add(rplot.GeomDensity2D()) + + @savefig rplot4_tips.png + plot.render(plt.gcf()) + +.. ipython:: python + :suppress: + + plt.close('all') + +Above is a similar plot but with 2D kernel density estimation plot superimposed. + +.. ipython:: python + + plt.figure() + + plot = rplot.RPlot(tips_data, x='total_bill', y='tip') + plot.add(rplot.TrellisGrid(['sex', '.'])) + plot.add(rplot.GeomHistogram()) + + @savefig rplot5_tips.png + plot.render(plt.gcf()) + +.. ipython:: python + :suppress: + + plt.close('all') + +It is possible to only use one attribute for grouping data. The example above only uses 'sex' attribute. If the second grouping attribute is not specified, the plots will be arranged in a column. + +.. ipython:: python + + plt.figure() + + plot = rplot.RPlot(tips_data, x='total_bill', y='tip') + plot.add(rplot.TrellisGrid(['.', 'smoker'])) + plot.add(rplot.GeomHistogram()) + + @savefig rplot6_tips.png + plot.render(plt.gcf()) + +.. ipython:: python + :suppress: + + plt.close('all') + +If the first grouping attribute is not specified the plots will be arranged in a row. + +.. ipython:: python + + plt.figure() + + plot = rplot.RPlot(tips_data, x='total_bill', y='tip') + plot.add(rplot.TrellisGrid(['.', 'smoker'])) + plot.add(rplot.GeomHistogram()) + + plot = rplot.RPlot(tips_data, x='tip', y='total_bill') + plot.add(rplot.TrellisGrid(['sex', 'smoker'])) + plot.add(rplot.GeomPoint(size=80.0, colour=rplot.ScaleRandomColour('day'), shape=rplot.ScaleShape('size'), alpha=1.0)) + + @savefig rplot7_tips.png + plot.render(plt.gcf()) + +.. ipython:: python + :suppress: + + plt.close('all') + +As shown above, scatter plots are also possible. Scatter plots allow you to map various data attributes to graphical properties of the plot. In the example above the colour and shape of the scatter plot graphical objects is mapped to 'day' and 'size' attributes respectively. You use scale objects to specify these mappings. The list of scale classes is given below with initialization arguments for quick reference. + + +Scales +~~~~~~ + +:: + + ScaleGradient(column, colour1, colour2) + +This one allows you to map an attribute (specified by parameter column) value to the colour of a graphical object. The larger the value of the attribute the closer the colour will be to colour2, the smaller the value, the closer it will be to colour1. + +:: + + ScaleGradient2(column, colour1, colour2, colour3) + +The same as ScaleGradient but interpolates linearly between three colours instead of two. + +:: + + ScaleSize(column, min_size, max_size, transform) + +Map attribute value to size of the graphical object. Parameter min_size (default 5.0) is the minimum size of the graphical object, max_size (default 100.0) is the maximum size and transform is a one argument function that will be used to transform the attribute value (defaults to lambda x: x). 
+ +:: + + ScaleShape(column) + +Map the shape of the object to attribute value. The attribute has to be categorical. + +:: + + ScaleRandomColour(column) + +Assign a random colour to a value of categorical attribute specified by column. diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst index 79093b4bbddc8..6ec72ad3a951c 100644 --- a/doc/source/whatsnew.rst +++ b/doc/source/whatsnew.rst @@ -7,6 +7,7 @@ import numpy as np from pandas import * + import pandas as pd randn = np.random.randn np.set_printoptions(precision=4, suppress=True) options.display.max_rows = 15 @@ -17,44 +18,46 @@ What's New These are new features and improvements of note in each release. -.. include:: v0.15.0.txt +.. include:: whatsnew/v0.15.1.txt -.. include:: v0.14.1.txt +.. include:: whatsnew/v0.15.0.txt -.. include:: v0.14.0.txt +.. include:: whatsnew/v0.14.1.txt -.. include:: v0.13.1.txt +.. include:: whatsnew/v0.14.0.txt -.. include:: v0.13.0.txt +.. include:: whatsnew/v0.13.1.txt -.. include:: v0.12.0.txt +.. include:: whatsnew/v0.13.0.txt -.. include:: v0.11.0.txt +.. include:: whatsnew/v0.12.0.txt -.. include:: v0.10.1.txt +.. include:: whatsnew/v0.11.0.txt -.. include:: v0.10.0.txt +.. include:: whatsnew/v0.10.1.txt -.. include:: v0.9.1.txt +.. include:: whatsnew/v0.10.0.txt -.. include:: v0.9.0.txt +.. include:: whatsnew/v0.9.1.txt -.. include:: v0.8.1.txt +.. include:: whatsnew/v0.9.0.txt -.. include:: v0.8.0.txt +.. include:: whatsnew/v0.8.1.txt -.. include:: v0.7.3.txt +.. include:: whatsnew/v0.8.0.txt -.. include:: v0.7.2.txt +.. include:: whatsnew/v0.7.3.txt -.. include:: v0.7.1.txt +.. include:: whatsnew/v0.7.2.txt -.. include:: v0.7.0.txt +.. include:: whatsnew/v0.7.1.txt -.. include:: v0.6.1.txt +.. include:: whatsnew/v0.7.0.txt -.. include:: v0.6.0.txt +.. include:: whatsnew/v0.6.1.txt -.. include:: v0.5.0.txt +.. include:: whatsnew/v0.6.0.txt -.. include:: v0.4.x.txt +.. include:: whatsnew/v0.5.0.txt + +.. include:: whatsnew/v0.4.x.txt diff --git a/doc/source/v0.10.0.txt b/doc/source/whatsnew/v0.10.0.txt similarity index 100% rename from doc/source/v0.10.0.txt rename to doc/source/whatsnew/v0.10.0.txt diff --git a/doc/source/v0.10.1.txt b/doc/source/whatsnew/v0.10.1.txt similarity index 100% rename from doc/source/v0.10.1.txt rename to doc/source/whatsnew/v0.10.1.txt diff --git a/doc/source/v0.11.0.txt b/doc/source/whatsnew/v0.11.0.txt similarity index 98% rename from doc/source/v0.11.0.txt rename to doc/source/whatsnew/v0.11.0.txt index 3a56794151b1e..befdf848ad23b 100644 --- a/doc/source/v0.11.0.txt +++ b/doc/source/whatsnew/v0.11.0.txt @@ -50,8 +50,7 @@ three types of multi-axis indexing. is interpreted as position based or label based, it's usually better to be explicit and use ``.iloc`` or ``.loc``. - See more at :ref:`Advanced Indexing `, :ref:`Advanced Hierarchical ` and - :ref:`Fallback Indexing ` + See more at :ref:`Advanced Indexing ` and :ref:`Advanced Hierarchical `. Selection Deprecations diff --git a/doc/source/v0.12.0.txt b/doc/source/whatsnew/v0.12.0.txt similarity index 100% rename from doc/source/v0.12.0.txt rename to doc/source/whatsnew/v0.12.0.txt diff --git a/doc/source/v0.13.0.txt b/doc/source/whatsnew/v0.13.0.txt similarity index 99% rename from doc/source/v0.13.0.txt rename to doc/source/whatsnew/v0.13.0.txt index ac0a14f45b69e..78239eef1b98f 100644 --- a/doc/source/v0.13.0.txt +++ b/doc/source/whatsnew/v0.13.0.txt @@ -460,7 +460,7 @@ Enhancements get_dummies([1, 2, np.nan], dummy_na=True) -- ``timedelta64[ns]`` operations. See :ref:`the docs`. 
+- ``timedelta64[ns]`` operations. See :ref:`the docs`. .. warning:: @@ -479,7 +479,7 @@ Enhancements A Series of dtype ``timedelta64[ns]`` can now be divided by another ``timedelta64[ns]`` object, or astyped to yield a ``float64`` dtyped Series. This - is frequency conversion. See :ref:`the docs` for the docs. + is frequency conversion. See :ref:`the docs` for the docs. .. ipython:: python diff --git a/doc/source/v0.13.1.txt b/doc/source/whatsnew/v0.13.1.txt similarity index 98% rename from doc/source/v0.13.1.txt rename to doc/source/whatsnew/v0.13.1.txt index b48f555f9691a..64ca1612f00c1 100644 --- a/doc/source/v0.13.1.txt +++ b/doc/source/whatsnew/v0.13.1.txt @@ -121,9 +121,10 @@ API changes .. ipython:: python - df = DataFrame({'col':['foo', 0, np.nan]}).sort() + df = DataFrame({'col':['foo', 0, np.nan]}) df2 = DataFrame({'col':[np.nan, 0, 'foo']}, index=[2,1,0]) - df.equals(df) + df.equals(df2) + df.equals(df2.sort()) import pandas.core.common as com com.array_equivalent(np.array([0, np.nan]), np.array([0, np.nan])) diff --git a/doc/source/v0.14.0.txt b/doc/source/whatsnew/v0.14.0.txt similarity index 99% rename from doc/source/v0.14.0.txt rename to doc/source/whatsnew/v0.14.0.txt index 96ab3d1e58d5c..e2f96f204edab 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/whatsnew/v0.14.0.txt @@ -470,7 +470,7 @@ You can use ``slice(None)`` to select all the contents of *that* level. You do n As usual, **both sides** of the slicers are included as this is label indexing. -See :ref:`the docs` +See :ref:`the docs` See also issues (:issue:`6134`, :issue:`4036`, :issue:`3057`, :issue:`2598`, :issue:`5641`, :issue:`7106`) .. warning:: diff --git a/doc/source/v0.14.1.txt b/doc/source/whatsnew/v0.14.1.txt similarity index 100% rename from doc/source/v0.14.1.txt rename to doc/source/whatsnew/v0.14.1.txt diff --git a/doc/source/whatsnew/v0.15.0.txt b/doc/source/whatsnew/v0.15.0.txt new file mode 100644 index 0000000000000..c8c7ed3b5011e --- /dev/null +++ b/doc/source/whatsnew/v0.15.0.txt @@ -0,0 +1,1126 @@ +.. _whatsnew_0150: + +v0.15.0 (October 18, 2014) +-------------------------- + +This is a major release from 0.14.1 and includes a small number of API changes, several new features, +enhancements, and performance improvements along with a large number of bug fixes. We recommend that all +users upgrade to this version. + +.. warning:: + + pandas >= 0.15.0 will no longer support compatibility with NumPy versions < + 1.7.0. 
If you want to use the latest versions of pandas, please upgrade to + NumPy >= 1.7.0 (:issue:`7711`) + +- Highlights include: + + - The ``Categorical`` type was integrated as a first-class pandas type, see :ref:`here ` + - New scalar type ``Timedelta``, and a new index type ``TimedeltaIndex``, see :ref:`here ` + - New DataFrame default display for ``df.info()`` to include memory usage, see :ref:`Memory Usage ` + - New datetimelike properties accessor ``.dt`` for Series, see :ref:`Datetimelike Properties ` + - Split indexing documentation into :ref:`Indexing and Selecting Data ` and :ref:`MultiIndex / Advanced Indexing ` + - Split out string methods documentation into :ref:`Working with Text Data ` + - ``read_csv`` will now by default ignore blank lines when parsing, see :ref:`here ` + - API change in using Indexes in set operations, see :ref:`here ` + - Internal refactoring of the ``Index`` class to no longer sub-class ``ndarray``, see :ref:`Internal Refactoring ` + - dropping support for ``PyTables`` less than version 3.0.0, and ``numexpr`` less than version 2.1 (:issue:`7990`) + +- :ref:`Other Enhancements ` + +- :ref:`API Changes ` + +- :ref:`Timezone API Change ` + +- :ref:`Rolling/Expanding Moments API Changes ` + +- :ref:`Performance Improvements ` + +- :ref:`Deprecations ` + +- :ref:`Bug Fixes ` + +.. warning:: + + In 0.15.0 ``Index`` has internally been refactored to no longer sub-class ``ndarray`` + but instead subclass ``PandasObject``, similarly to the rest of the pandas objects. This change allows very easy sub-classing and creation of new index types. This should be + a transparent change with only very limited API implications (See the :ref:`Internal Refactoring `) + +.. warning:: + + The refactorings in :class:`~pandas.Categorical` changed the two argument constructor from + "codes/labels and levels" to "values and levels". This can lead to subtle bugs. If you use + :class:`~pandas.Categorical` directly, please audit your code before updating to this pandas + version and change it to use the :meth:`~pandas.Categorical.from_codes` constructor. See more on ``Categorical`` :ref:`here ` + +.. _whatsnew_0150.api: + +API changes +~~~~~~~~~~~ +- :func:`describe` on mixed-types DataFrames is more flexible. Type-based column filtering is now possible via the ``include``/``exclude`` arguments. + See the :ref:`docs ` (:issue:`8164`). + + .. ipython:: python + + df = DataFrame({'catA': ['foo', 'foo', 'bar'] * 8, + 'catB': ['a', 'b', 'c', 'd'] * 6, + 'numC': np.arange(24), + 'numD': np.arange(24.) + .5}) + df.describe(include=["object"]) + df.describe(include=["number", "object"], exclude=["float"]) + + Requesting all columns is possible with the shorthand 'all' + + .. ipython:: python + + df.describe(include='all') + + Without those arguments, 'describe` will behave as before, including only numerical columns or, if none are, only categorical columns. See also the :ref:`docs ` + +- Passing multiple levels to :meth:`~pandas.DataFrame.stack()` will now work when multiple level + numbers are passed (:issue:`7660`), and will raise a ``ValueError`` when the + levels aren't all level names or all level numbers. See + :ref:`Reshaping by stacking and unstacking `. + +- :func:`set_names`, :func:`set_labels`, and :func:`set_levels` methods now take an optional ``level`` keyword argument to all modification of specific level(s) of a MultiIndex. 
Additionally :func:`set_names` now accepts a scalar string value when operating on an ``Index`` or on a specific level of a ``MultiIndex`` (:issue:`7792`) + + .. ipython:: python + + idx = MultiIndex.from_product([['a'], range(3), list("pqr")], names=['foo', 'bar', 'baz']) + idx.set_names('qux', level=0) + idx.set_names(['qux','baz'], level=[0,1]) + idx.set_levels(['a','b','c'], level='bar') + idx.set_levels([['a','b','c'],[1,2,3]], level=[1,2]) + +- Raise a ``ValueError`` in ``df.to_hdf`` with 'fixed' format, if ``df`` has non-unique columns as the resulting file will be broken (:issue:`7761`) + +.. _whatsnew_0150.blanklines: + +- Made both the C-based and Python engines for `read_csv` and `read_table` ignore empty lines in input as well as + whitespace-filled lines, as long as ``sep`` is not whitespace. This is an API change + that can be controlled by the keyword parameter ``skip_blank_lines``. See :ref:`the docs ` (:issue:`4466`) + +- Bug in passing a ``DatetimeIndex`` with a timezone that was not being retained in DataFrame construction from a dict (:issue:`7822`) + + In prior versions this would drop the timezone. + + .. ipython:: python + + i = date_range('1/1/2011', periods=3, freq='10s', tz = 'US/Eastern') + i + df = DataFrame( {'a' : i } ) + df + df.dtypes + + This behavior is unchanged. + + .. ipython:: python + + df = DataFrame( ) + df['a'] = i + df + df.dtypes + +- ``SettingWithCopy`` raise/warnings (according to the option ``mode.chained_assignment``) will now be issued when setting a value on a sliced mixed-dtype DataFrame using chained-assignment. (:issue:`7845`, :issue:`7950`) + + .. code-block:: python + + In [1]: df = DataFrame(np.arange(0,9), columns=['count']) + + In [2]: df['group'] = 'b' + + In [3]: df.iloc[0:5]['group'] = 'a' + /usr/local/bin/ipython:1: SettingWithCopyWarning: + A value is trying to be set on a copy of a slice from a DataFrame. + Try using .loc[row_indexer,col_indexer] = value instead + + See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy + +- The ``infer_types`` argument to :func:`~pandas.io.html.read_html` now has no + effect (:issue:`7762`, :issue:`7032`). + +- ``DataFrame.to_stata`` and ``StataWriter`` check string length for + compatibility with limitations imposed in dta files where fixed-width + strings must contain 244 or fewer characters. Attempting to write Stata + dta files with strings longer than 244 characters raises a ``ValueError``. (:issue:`7858`) + +- ``read_stata`` and ``StataReader`` can import missing data information into a + ``DataFrame`` by setting the argument ``convert_missing`` to ``True``. When + using this options, missing values are returned as ``StataMissingValue`` + objects and columns containing missing values have ``object`` data type. (:issue:`8045`) + +- ``Index.isin`` now supports a ``level`` argument to specify which index level + to use for membership tests (:issue:`7892`, :issue:`7890`) + + .. code-block:: python + + In [1]: idx = MultiIndex.from_product([[0, 1], ['a', 'b', 'c']]) + + In [2]: idx.values + Out[2]: array([(0, 'a'), (0, 'b'), (0, 'c'), (1, 'a'), (1, 'b'), (1, 'c')], dtype=object) + + In [3]: idx.isin(['a', 'c', 'e'], level=1) + Out[3]: array([ True, False, True, True, False, True], dtype=bool) + +- ``merge``, ``DataFrame.merge``, and ``ordered_merge`` now return the same type + as the ``left`` argument. (:issue:`7737`) + +- Histogram from ``DataFrame.plot`` with ``kind='hist'`` (:issue:`7809`), See :ref:`the docs`. 
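For illustration, a minimal sketch of the new ``kind='hist'`` support (the frame below is made-up demonstration data; ``alpha`` and ``bins`` are simply forwarded to matplotlib):

.. code-block:: python

   import numpy as np
   from pandas import DataFrame

   # illustrative data only
   df_h = DataFrame({'a': np.random.randn(1000) + 1,
                     'b': np.random.randn(1000)})

   # one histogram per column, overlaid on a single axes
   df_h.plot(kind='hist', alpha=0.5, bins=25)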
+- Boxplot from ``DataFrame.plot`` with ``kind='box'`` (:issue:`7998`), See :ref:`the docs`. +- Consistency when indexing with ``.loc`` and a list-like indexer when no values are found. + + .. ipython:: python + + df = DataFrame([['a'],['b']],index=[1,2]) + df + + In prior versions there was a difference in these two constructs: + + - ``df.loc[[3]]`` would return a frame reindexed by 3 (with all ``np.nan`` values) + - ``df.loc[[3],:]`` would raise ``KeyError``. + + Both will now raise a ``KeyError``. The rule is that *at least 1* indexer must be found when using a list-like and ``.loc`` (:issue:`7999`) + + Furthermore in prior versions these were also different: + + - ``df.loc[[1,3]]`` would return a frame reindexed by [1,3] + - ``df.loc[[1,3],:]`` would raise ``KeyError``. + + Both will now return a frame reindex by [1,3]. E.g. + + .. ipython:: python + + df.loc[[1,3]] + df.loc[[1,3],:] + + This can also be seen in multi-axis indexing with a ``Panel``. + + .. ipython:: python + + p = Panel(np.arange(2*3*4).reshape(2,3,4), + items=['ItemA','ItemB'], + major_axis=[1,2,3], + minor_axis=['A','B','C','D']) + p + + The following would raise ``KeyError`` prior to 0.15.0: + + .. ipython:: python + + p.loc[['ItemA','ItemD'],:,'D'] + + Furthermore, ``.loc`` will raise If no values are found in a multi-index with a list-like indexer: + + .. ipython:: python + :okexcept: + + s = Series(np.arange(3,dtype='int64'), + index=MultiIndex.from_product([['A'],['foo','bar','baz']], + names=['one','two']) + ).sortlevel() + s + try: + s.loc[['D']] + except KeyError as e: + print("KeyError: " + str(e)) + +- ``Index`` now supports ``duplicated`` and ``drop_duplicates``. (:issue:`4060`) + + .. ipython:: python + + idx = Index([1, 2, 3, 4, 1, 2]) + idx + idx.duplicated() + idx.drop_duplicates() + +- Assigning values to ``None`` now considers the dtype when choosing an 'empty' value (:issue:`7941`). + + Previously, assigning to ``None`` in numeric containers changed the + dtype to object (or errored, depending on the call). It now uses + ``NaN``: + + .. ipython:: python + + s = Series([1, 2, 3]) + s.loc[0] = None + s + + ``NaT`` is now used similarly for datetime containers. + + For object containers, we now preserve ``None`` values (previously these + were converted to ``NaN`` values). + + .. ipython:: python + + s = Series(["a", "b", "c"]) + s.loc[0] = None + s + + To insert a ``NaN``, you must explicitly use ``np.nan``. See the :ref:`docs `. + +- Previously an enlargement with a mixed-dtype frame would act unlike ``.append`` which will preserve dtypes (related :issue:`2578`, :issue:`8176`): + + .. ipython:: python + + df = DataFrame([[True, 1],[False, 2]], + columns=["female","fitness"]) + df + df.dtypes + + # dtypes are now preserved + df.loc[2] = df.loc[1] + df + df.dtypes + +- In prior versions, updating a pandas object inplace would not reflect in other python references to this object. (:issue:`8511`,:issue:`5104`) + + .. ipython:: python + + s = Series([1, 2, 3]) + s2 = s + s += 1.5 + + Behavior prior to v0.15.0 + + .. code-block:: python + + + # the original object + In [5]: s + Out[5]: + 0 2.5 + 1 3.5 + 2 4.5 + dtype: float64 + + + # a reference to the original object + In [7]: s2 + Out[7]: + 0 1 + 1 2 + 2 3 + dtype: int64 + + This is now the correct behavior + + .. ipython:: python + + # the original object + s + + # a reference to the original object + s2 + +- ``Series.to_csv()`` now returns a string when ``path=None``, matching the behaviour of ``DataFrame.to_csv()`` (:issue:`8215`). 
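A small sketch of the ``path=None`` behaviour described above (no file is written; the CSV text is returned as a string):

.. code-block:: python

   from pandas import Series

   s = Series([1, 2, 3], name='x')

   # returns the CSV output instead of writing it to disk
   csv_text = s.to_csv(path=None)
   print(csv_text)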
+ +- ``read_hdf`` now raises ``IOError`` when a file that doesn't exist is passed in. Previously, a new, empty file was created, and a ``KeyError`` raised (:issue:`7715`). + +- ``DataFrame.info()`` now ends its output with a newline character (:issue:`8114`) +- add ``copy=True`` argument to ``pd.concat`` to enable pass thru of complete blocks (:issue:`8252`) + +- Added support for numpy 1.8+ data types (``bool_``, ``int_``, ``float_``, ``string_``) for conversion to R dataframe (:issue:`8400`) +- Concatenating no objects will now raise a ``ValueError`` rather than a bare ``Exception``. +- Merge errors will now be sub-classes of ``ValueError`` rather than raw ``Exception`` (:issue:`8501`) +- ``DataFrame.plot`` and ``Series.plot`` keywords are now have consistent orders (:issue:`8037`) + +.. _whatsnew_0150.memory: + +Memory Usage +~~~~~~~~~~~~~ + +Implemented methods to find memory usage of a DataFrame. See the :ref:`FAQ ` for more. (:issue:`6852`). + +A new display option ``display.memory_usage`` (see :ref:`options`) sets the default behavior of the ``memory_usage`` argument in the ``df.info()`` method. By default ``display.memory_usage`` is ``True``. + +.. ipython:: python + + dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]', + 'complex128', 'object', 'bool'] + n = 5000 + data = dict([ (t, np.random.randint(100, size=n).astype(t)) + for t in dtypes]) + df = DataFrame(data) + df['categorical'] = df['object'].astype('category') + + df.info() + +Additionally :meth:`~pandas.DataFrame.memory_usage` is an available method for a dataframe object which returns the memory usage of each column. + +.. ipython:: python + + df.memory_usage(index=True) + +.. _whatsnew_0150.dt: + +.dt accessor +~~~~~~~~~~~~ + +``Series`` has gained an accessor to succinctly return datetime like properties for the *values* of the Series, if its a datetime/period like Series. (:issue:`7207`) +This will return a Series, indexed like the existing Series. See the :ref:`docs ` + +.. ipython:: python + + # datetime + s = Series(date_range('20130101 09:10:12',periods=4)) + s + s.dt.hour + s.dt.second + s.dt.day + s.dt.freq + +This enables nice expressions like this: + +.. ipython:: python + + s[s.dt.day==2] + +You can easily produce tz aware transformations: + +.. ipython:: python + + stz = s.dt.tz_localize('US/Eastern') + stz + stz.dt.tz + +You can also chain these types of operations: + +.. ipython:: python + + s.dt.tz_localize('UTC').dt.tz_convert('US/Eastern') + +The ``.dt`` accessor works for period and timedelta dtypes. + +.. ipython:: python + + # period + s = Series(period_range('20130101',periods=4,freq='D')) + s + s.dt.year + s.dt.day + +.. ipython:: python + + # timedelta + s = Series(timedelta_range('1 day 00:00:05',periods=4,freq='s')) + s + s.dt.days + s.dt.seconds + s.dt.components + +.. _whatsnew_0150.tz: + +Timezone API changes +~~~~~~~~~~~~~~~~~~~~ + +- ``tz_localize(None)`` for tz-aware ``Timestamp`` and ``DatetimeIndex`` now removes timezone holding local time, + previously this resulted in ``Exception`` or ``TypeError`` (:issue:`7812`) + + .. 
ipython:: python + + ts = Timestamp('2014-08-01 09:00', tz='US/Eastern') + ts + ts.tz_localize(None) + + didx = DatetimeIndex(start='2014-08-01 09:00', freq='H', periods=10, tz='US/Eastern') + didx + didx.tz_localize(None) + +- ``tz_localize`` now accepts the ``ambiguous`` keyword which allows for passing an array of bools + indicating whether the date belongs in DST or not, 'NaT' for setting transition times to NaT, + 'infer' for inferring DST/non-DST, and 'raise' (default) for an ``AmbiguousTimeError`` to be raised. See :ref:`the docs` for more details (:issue:`7943`) + +- ``DataFrame.tz_localize`` and ``DataFrame.tz_convert`` now accepts an optional ``level`` argument + for localizing a specific level of a MultiIndex (:issue:`7846`) + +- ``Timestamp.tz_localize`` and ``Timestamp.tz_convert`` now raise ``TypeError`` in error cases, rather than ``Exception`` (:issue:`8025`) + +- a timeseries/index localized to UTC when inserted into a Series/DataFrame will preserve the UTC timezone (rather than being a naive ``datetime64[ns]``) as ``object`` dtype (:issue:`8411`) + +- ``Timestamp.__repr__`` displays ``dateutil.tz.tzoffset`` info (:issue:`7907`) + +.. _whatsnew_0150.roll: + +Rolling/Expanding Moments API changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- :func:`rolling_min`, :func:`rolling_max`, :func:`rolling_cov`, and :func:`rolling_corr` + now return objects with all ``NaN`` when ``len(arg) < min_periods <= window`` rather + than raising. (This makes all rolling functions consistent in this behavior), (:issue:`7766`) + + Prior to 0.15.0 + + .. ipython:: python + + s = Series([10, 11, 12, 13]) + + .. code-block:: python + + In [15]: rolling_min(s, window=10, min_periods=5) + ValueError: min_periods (5) must be <= window (4) + + New behavior + + .. ipython:: python + + rolling_min(s, window=10, min_periods=5) + +- :func:`rolling_max`, :func:`rolling_min`, :func:`rolling_sum`, :func:`rolling_mean`, :func:`rolling_median`, + :func:`rolling_std`, :func:`rolling_var`, :func:`rolling_skew`, :func:`rolling_kurt`, :func:`rolling_quantile`, + :func:`rolling_cov`, :func:`rolling_corr`, :func:`rolling_corr_pairwise`, + :func:`rolling_window`, and :func:`rolling_apply` with ``center=True`` previously would return a result of the same + structure as the input ``arg`` with ``NaN`` in the final ``(window-1)/2`` entries. + + Now the final ``(window-1)/2`` entries of the result are calculated as if the input ``arg`` were followed + by ``(window-1)/2`` ``NaN`` values (or with shrinking windows, in the case of :func:`rolling_apply`). + (:issue:`7925`, :issue:`8269`) + + Prior behavior (note final value is ``NaN``): + + .. code-block:: python + + In [7]: rolling_sum(Series(range(4)), window=3, min_periods=0, center=True) + Out[7]: + 0 1 + 1 3 + 2 6 + 3 NaN + dtype: float64 + + New behavior (note final value is ``5 = sum([2, 3, NaN])``): + + .. ipython:: python + + rolling_sum(Series(range(4)), window=3, min_periods=0, center=True) + +- :func:`rolling_window` now normalizes the weights properly in rolling mean mode (`mean=True`) so that + the calculated weighted means (e.g. 'triang', 'gaussian') are distributed about the same means as those + calculated without weighting (i.e. 'boxcar'). See :ref:`the note on normalization ` for further details. (:issue:`7618`) + + .. ipython:: python + + s = Series([10.5, 8.8, 11.4, 9.7, 9.3]) + + Behavior prior to 0.15.0: + + .. 
code-block:: python + + In [39]: rolling_window(s, window=3, win_type='triang', center=True) + Out[39]: + 0 NaN + 1 6.583333 + 2 6.883333 + 3 6.683333 + 4 NaN + dtype: float64 + + New behavior + + .. ipython:: python + + rolling_window(s, window=3, win_type='triang', center=True) + +- Removed ``center`` argument from :func:`expanding_max`, :func:`expanding_min`, :func:`expanding_sum`, + :func:`expanding_mean`, :func:`expanding_median`, :func:`expanding_std`, :func:`expanding_var`, + :func:`expanding_skew`, :func:`expanding_kurt`, :func:`expanding_quantile`, :func:`expanding_count`, + :func:`expanding_cov`, :func:`expanding_corr`, :func:`expanding_corr_pairwise`, and :func:`expanding_apply`, + as the results produced when ``center=True`` did not make much sense. (:issue:`7925`) + +- Added optional ``ddof`` argument to :func:`expanding_cov` and :func:`rolling_cov`. + The default value of ``1`` is backwards-compatible. (:issue:`8279`) + +- Documented the ``ddof`` argument to :func:`expanding_var`, :func:`expanding_std`, + :func:`rolling_var`, and :func:`rolling_std`. These functions' support of a + ``ddof`` argument (with a default value of ``1``) was previously undocumented. (:issue:`8064`) + +- :func:`ewma`, :func:`ewmstd`, :func:`ewmvol`, :func:`ewmvar`, :func:`ewmcov`, and :func:`ewmcorr` + now interpret ``min_periods`` in the same manner that the :func:`rolling_*()` and :func:`expanding_*()` functions do: + a given result entry will be ``NaN`` if the (expanding, in this case) window does not contain + at least ``min_periods`` values. The previous behavior was to set to ``NaN`` the ``min_periods`` entries + starting with the first non- ``NaN`` value. (:issue:`7977`) + + Prior behavior (note values start at index ``2``, which is ``min_periods`` after index ``0`` + (the index of the first non-empty value)): + + .. ipython:: python + + s = Series([1, None, None, None, 2, 3]) + + .. code-block:: python + + In [51]: ewma(s, com=3., min_periods=2) + Out[51]: + 0 NaN + 1 NaN + 2 1.000000 + 3 1.000000 + 4 1.571429 + 5 2.189189 + dtype: float64 + + New behavior (note values start at index ``4``, the location of the 2nd (since ``min_periods=2``) non-empty value): + + .. ipython:: python + + ewma(s, com=3., min_periods=2) + +- :func:`ewmstd`, :func:`ewmvol`, :func:`ewmvar`, :func:`ewmcov`, and :func:`ewmcorr` + now have an optional ``adjust`` argument, just like :func:`ewma` does, + affecting how the weights are calculated. + The default value of ``adjust`` is ``True``, which is backwards-compatible. + See :ref:`Exponentially weighted moment functions ` for details. (:issue:`7911`) + +- :func:`ewma`, :func:`ewmstd`, :func:`ewmvol`, :func:`ewmvar`, :func:`ewmcov`, and :func:`ewmcorr` + now have an optional ``ignore_na`` argument. + When ``ignore_na=False`` (the default), missing values are taken into account in the weights calculation. + When ``ignore_na=True`` (which reproduces the pre-0.15.0 behavior), missing values are ignored in the weights calculation. + (:issue:`7543`) + + .. ipython:: python + + ewma(Series([None, 1., 8.]), com=2.) + ewma(Series([1., None, 8.]), com=2., ignore_na=True) # pre-0.15.0 behavior + ewma(Series([1., None, 8.]), com=2., ignore_na=False) # new default + + .. warning:: + + By default (``ignore_na=False``) the :func:`ewm*()` functions' weights calculation + in the presence of missing values is different than in pre-0.15.0 versions. + To reproduce the pre-0.15.0 calculation of weights in the presence of missing values + one must specify explicitly ``ignore_na=True``. 
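The same ``ignore_na`` keyword is accepted by the other ``ewm*`` functions listed above; a sketch using ``ewmstd`` with arbitrary data:

.. code-block:: python

   import pandas as pd

   s = pd.Series([1., None, 8., 5.])

   # new default: missing values participate in the weight calculation
   pd.ewmstd(s, com=2., ignore_na=False)

   # reproduces the pre-0.15.0 weighting
   pd.ewmstd(s, com=2., ignore_na=True)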
+ +- Bug in :func:`expanding_cov`, :func:`expanding_corr`, :func:`rolling_cov`, :func:`rolling_cor`, :func:`ewmcov`, and :func:`ewmcorr` + returning results with columns sorted by name and producing an error for non-unique columns; + now handles non-unique columns and returns columns in original order + (except for the case of two DataFrames with ``pairwise=False``, where behavior is unchanged) (:issue:`7542`) +- Bug in :func:`rolling_count` and :func:`expanding_*()` functions unnecessarily producing error message for zero-length data (:issue:`8056`) +- Bug in :func:`rolling_apply` and :func:`expanding_apply` interpreting ``min_periods=0`` as ``min_periods=1`` (:issue:`8080`) +- Bug in :func:`expanding_std` and :func:`expanding_var` for a single value producing a confusing error message (:issue:`7900`) +- Bug in :func:`rolling_std` and :func:`rolling_var` for a single value producing ``0`` rather than ``NaN`` (:issue:`7900`) + +- Bug in :func:`ewmstd`, :func:`ewmvol`, :func:`ewmvar`, and :func:`ewmcov` + calculation of de-biasing factors when ``bias=False`` (the default). + Previously an incorrect constant factor was used, based on ``adjust=True``, ``ignore_na=True``, + and an infinite number of observations. + Now a different factor is used for each entry, based on the actual weights + (analogous to the usual ``N/(N-1)`` factor). + In particular, for a single point a value of ``NaN`` is returned when ``bias=False``, + whereas previously a value of (approximately) ``0`` was returned. + + For example, consider the following pre-0.15.0 results for ``ewmvar(..., bias=False)``, + and the corresponding debiasing factors: + + .. ipython:: python + + s = Series([1., 2., 0., 4.]) + + .. code-block:: python + + In [89]: ewmvar(s, com=2., bias=False) + Out[89]: + 0 -2.775558e-16 + 1 3.000000e-01 + 2 9.556787e-01 + 3 3.585799e+00 + dtype: float64 + + In [90]: ewmvar(s, com=2., bias=False) / ewmvar(s, com=2., bias=True) + Out[90]: + 0 1.25 + 1 1.25 + 2 1.25 + 3 1.25 + dtype: float64 + + Note that entry ``0`` is approximately 0, and the debiasing factors are a constant 1.25. + By comparison, the following 0.15.0 results have a ``NaN`` for entry ``0``, + and the debiasing factors are decreasing (towards 1.25): + + .. ipython:: python + + ewmvar(s, com=2., bias=False) + ewmvar(s, com=2., bias=False) / ewmvar(s, com=2., bias=True) + + See :ref:`Exponentially weighted moment functions ` for details. (:issue:`7912`) + +.. _whatsnew_0150.refactoring: + +Internal Refactoring +~~~~~~~~~~~~~~~~~~~~ + +In 0.15.0 ``Index`` has internally been refactored to no longer sub-class ``ndarray`` +but instead subclass ``PandasObject``, similarly to the rest of the pandas objects. This change allows very easy sub-classing and creation of new index types. This should be +a transparent change with only very limited API implications (:issue:`5080`, :issue:`7439`, :issue:`7796`, :issue:`8024`, :issue:`8367`, :issue:`7997`, :issue:`8522`) + +- you may need to unpickle pandas version < 0.15.0 pickles using ``pd.read_pickle`` rather than ``pickle.load``. See :ref:`pickle docs ` +- when plotting with a ``PeriodIndex``. The ``matplotlib`` internal axes will now be arrays of ``Period`` rather than a ``PeriodIndex``. (this is similar to how a ``DatetimeIndex`` passes arrays of ``datetimes`` now) +- MultiIndexes will now raise similary to other pandas objects w.r.t. truth testing, See :ref:`here ` (:issue:`7897`). + +.. 
_whatsnew_0150.cat: + +Categoricals in Series/DataFrame +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:class:`~pandas.Categorical` can now be included in `Series` and `DataFrames` and gained new +methods to manipulate. Thanks to Jan Schulz for much of this API/implementation. (:issue:`3943`, :issue:`5313`, :issue:`5314`, +:issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`, :issue:`7914`, :issue:`7768`, :issue:`8006`, :issue:`3678`, +:issue:`8075`, :issue:`8076`, :issue:`8143`, :issue:`8453`, :issue:`8518`). + +For full docs, see the :ref:`categorical introduction ` and the +:ref:`API documentation `. + +.. ipython:: python + + df = DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']}) + + df["grade"] = df["raw_grade"].astype("category") + df["grade"] + + # Rename the categories + df["grade"].cat.categories = ["very good", "good", "very bad"] + + # Reorder the categories and simultaneously add the missing categories + df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"]) + df["grade"] + df.sort("grade") + df.groupby("grade").size() + +- ``pandas.core.group_agg`` and ``pandas.core.factor_agg`` were removed. As an alternative, construct + a dataframe and use ``df.groupby().agg()``. + +- Supplying "codes/labels and levels" to the :class:`~pandas.Categorical` constructor is not + supported anymore. Supplying two arguments to the constructor is now interpreted as + "values and levels". Please change your code to use the :meth:`~pandas.Categorical.from_codes` + constructor. + +- The ``Categorical.labels`` attribute was renamed to ``Categorical.codes`` and is read + only. If you want to manipulate codes, please use one of the + :ref:`API methods on Categoricals `. + +.. _whatsnew_0150.timedeltaindex: + +TimedeltaIndex/Scalar +~~~~~~~~~~~~~~~~~~~~~ + +We introduce a new scalar type ``Timedelta``, which is a subclass of ``datetime.timedelta``, and behaves in a similar manner, +but allows compatibility with ``np.timedelta64`` types as well as a host of custom representation, parsing, and attributes. +This type is very similar to how ``Timestamp`` works for ``datetimes``. It is a nice-API box for the type. See the :ref:`docs `. +(:issue:`3009`, :issue:`4533`, :issue:`8209`, :issue:`8187`, :issue:`8190`, :issue:`7869`, :issue:`7661`, :issue:`8345`, :issue:`8471`) + +.. warning:: + + ``Timedelta`` scalars (and ``TimedeltaIndex``) component fields are *not the same* as the component fields on a ``datetime.timedelta`` object. For example, ``.seconds`` on a ``datetime.timedelta`` object returns the total number of seconds combined between ``hours``, ``minutes`` and ``seconds``. In contrast, the pandas ``Timedelta`` breaks out hours, minutes, microseconds and nanoseconds separately. + + .. ipython:: python + + # Timedelta accessor + tds = Timedelta('31 days 5 min 3 sec') + tds.minutes + tds.seconds + + # datetime.timedelta accessor + # this is 5 minutes * 60 + 3 seconds + tds.to_pytimedelta().seconds + +.. warning:: + + Prior to 0.15.0 ``pd.to_timedelta`` would return a ``Series`` for list-like/Series input, and a ``np.timedelta64`` for scalar input. + It will now return a ``TimedeltaIndex`` for list-like input, ``Series`` for Series input, and ``Timedelta`` for scalar input. + + The arguments to ``pd.to_timedelta`` are now ``(arg,unit='ns',box=True,coerce=False)``, previously were ``(arg,box=True,unit='ns')`` as these are more logical. + +Consruct a scalar + +.. 
ipython:: python + + Timedelta('1 days 06:05:01.00003') + Timedelta('15.5us') + Timedelta('1 hour 15.5us') + + # negative Timedeltas have this string repr + # to be more consistent with datetime.timedelta conventions + Timedelta('-1us') + + # a NaT + Timedelta('nan') + +Access fields for a ``Timedelta`` + +.. ipython:: python + + td = Timedelta('1 hour 3m 15.5us') + td.hours + td.minutes + td.microseconds + td.nanoseconds + +Construct a ``TimedeltaIndex`` + +.. ipython:: python + :suppress: + + import datetime + from datetime import timedelta + +.. ipython:: python + + TimedeltaIndex(['1 days','1 days, 00:00:05', + np.timedelta64(2,'D'),timedelta(days=2,seconds=2)]) + +Constructing a ``TimedeltaIndex`` with a regular range + +.. ipython:: python + + timedelta_range('1 days',periods=5,freq='D') + timedelta_range(start='1 days',end='2 days',freq='30T') + +You can now use a ``TimedeltaIndex`` as the index of a pandas object + +.. ipython:: python + + s = Series(np.arange(5), + index=timedelta_range('1 days',periods=5,freq='s')) + s + +You can select with partial string selections + +.. ipython:: python + + s['1 day 00:00:02'] + s['1 day':'1 day 00:00:02'] + +Finally, the combination of ``TimedeltaIndex`` with ``DatetimeIndex`` allow certain combination operations that are ``NaT`` preserving: + +.. ipython:: python + + tdi = TimedeltaIndex(['1 days',pd.NaT,'2 days']) + tdi.tolist() + dti = date_range('20130101',periods=3) + dti.tolist() + + (dti + tdi).tolist() + (dti - tdi).tolist() + +- iteration of a ``Series`` e.g. ``list(Series(...))`` of ``timedelta64[ns]`` would prior to v0.15.0 return ``np.timedelta64`` for each element. These will now be wrapped in ``Timedelta``. + +.. _whatsnew_0150.prior_deprecations: + +Prior Version Deprecations/Changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- Remove ``DataFrame.delevel`` method in favor of ``DataFrame.reset_index`` + +.. _whatsnew_0150.deprecations: + +Deprecations +~~~~~~~~~~~~ + +- The ``outtype`` argument to ``pd.DataFrame.to_dict`` has been deprecated in favor of ``orient``. (:issue:`7840`) +- The ``convert_dummies`` method has been deprecated in favor of + ``get_dummies`` (:issue:`8140`) +- The ``infer_dst`` argument in ``tz_localize`` will be deprecated in favor of + ``ambiguous`` to allow for more flexibility in dealing with DST transitions. + Replace ``infer_dst=True`` with ``ambiguous='infer'`` for the same behavior (:issue:`7943`). + See :ref:`the docs` for more details. +- The top-level ``pd.value_range`` has been deprecated and can be replaced by ``.describe()`` (:issue:`8481`) + +.. _whatsnew_0150.index_set_ops: + +- The ``Index`` set operations ``+`` and ``-`` were deprecated in order to provide these for numeric type operations on certain index types. ``+`` can be replace by ``.union()`` or ``|``, and ``-`` by ``.difference()``. Further the method name ``Index.diff()`` is deprecated and can be replaced by ``Index.difference()`` (:issue:`8226`) + + .. code-block:: python + + # + + Index(['a','b','c']) + Index(['b','c','d']) + + # should be replaced by + Index(['a','b','c']).union(Index(['b','c','d'])) + + .. code-block:: python + + # - + Index(['a','b','c']) - Index(['b','c','d']) + + # should be replaced by + Index(['a','b','c']).difference(Index(['b','c','d'])) + +.. _whatsnew_0150.enhancements: + +Enhancements +~~~~~~~~~~~~ + +- Added support for a ``chunksize`` parameter to ``to_sql`` function. This allows DataFrame to be written in chunks and avoid packet-size overflow errors (:issue:`8062`). 
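For example, a sketch using an in-memory SQLite engine via SQLAlchemy (the engine URL and table name here are placeholders):

.. code-block:: python

   import numpy as np
   from pandas import DataFrame
   from sqlalchemy import create_engine

   engine = create_engine('sqlite://')   # throw-away in-memory database
   df = DataFrame({'a': np.arange(10000), 'b': np.arange(10000)})

   # rows are written in batches of 1000 instead of a single large insert
   df.to_sql('demo_table', engine, chunksize=1000, index=False)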
+- Added support for a ``chunksize`` parameter to ``read_sql`` function. Specifying this argument will return an iterator through chunks of the query result (:issue:`2908`). +- Added support for writing ``datetime.date`` and ``datetime.time`` object columns with ``to_sql`` (:issue:`6932`). +- Added support for specifying a ``schema`` to read from/write to with ``read_sql_table`` and ``to_sql`` (:issue:`7441`, :issue:`7952`). + For example: + + .. code-block:: python + + df.to_sql('table', engine, schema='other_schema') + pd.read_sql_table('table', engine, schema='other_schema') + +- Added support for writing ``NaN`` values with ``to_sql`` (:issue:`2754`). +- Added support for writing datetime64 columns with ``to_sql`` for all database flavors (:issue:`7103`). + +- Added support for bool, uint8, uint16 and uint32 datatypes in ``to_stata`` (:issue:`7097`, :issue:`7365`) +- Added conversion option when importing Stata files (:issue:`8527`) + +- Added ``layout`` keyword to ``DataFrame.plot``. You can pass a tuple of ``(rows, columns)``, one of which can be ``-1`` to automatically infer (:issue:`6667`, :issue:`8071`). +- Allow to pass multiple axes to ``DataFrame.plot``, ``hist`` and ``boxplot`` (:issue:`5353`, :issue:`6970`, :issue:`7069`) +- Added support for ``c``, ``colormap`` and ``colorbar`` arguments for ``DataFrame.plot`` with ``kind='scatter'`` (:issue:`7780`) + +- ``read_csv`` now has a keyword parameter ``float_precision`` which specifies which floating-point converter the C engine should use during parsing, see :ref:`here ` (:issue:`8002`, :issue:`8044`) + +- ``PeriodIndex`` supports ``resolution`` as the same as ``DatetimeIndex`` (:issue:`7708`) +- ``pandas.tseries.holiday`` has added support for additional holidays and ways to observe holidays (:issue:`7070`) +- ``pandas.tseries.holiday.Holiday`` now supports a list of offsets in Python3 (:issue:`7070`) +- ``pandas.tseries.holiday.Holiday`` now supports a days_of_week parameter (:issue:`7070`) +- ``GroupBy.nth()`` now supports selecting multiple nth values (:issue:`7910`) + + .. ipython:: python + + business_dates = date_range(start='4/1/2014', end='6/30/2014', freq='B') + df = DataFrame(1, index=business_dates, columns=['a', 'b']) + # get the first, 4th, and last date index for each month + df.groupby((df.index.year, df.index.month)).nth([0, 3, -1]) + +- ``Period`` and ``PeriodIndex`` supports addition/subtraction with ``timedelta``-likes (:issue:`7966`) + + If ``Period`` freq is ``D``, ``H``, ``T``, ``S``, ``L``, ``U``, ``N``, ``Timedelta``-like can be added if the result can have same freq. Otherwise, only the same ``offsets`` can be added. + + .. ipython:: python + + idx = pd.period_range('2014-07-01 09:00', periods=5, freq='H') + idx + idx + pd.offsets.Hour(2) + idx + Timedelta('120m') + + idx = pd.period_range('2014-07', periods=5, freq='M') + idx + idx + pd.offsets.MonthEnd(3) + +- Added ``split`` as an option to the ``orient`` argument in ``pd.DataFrame.to_dict``. (:issue:`7840`) + +- The ``get_dummies`` method can now be used on DataFrames. By default only + catagorical columns are encoded as 0's and 1's, while other columns are + left untouched. + + .. ipython:: python + + df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['c', 'c', 'b'], + 'C': [1, 2, 3]}) + pd.get_dummies(df) + + + +- Added experimental compatibility with ``openpyxl`` for versions >= 2.0. 
The ``DataFrame.to_excel`` + method ``engine`` keyword now recognizes ``openpyxl1`` and ``openpyxl2`` + which will explicitly require openpyxl v1 and v2 respectively, failing if + the requested version is not available. The ``openpyxl`` engine is a now a + meta-engine that automatically uses whichever version of openpyxl is + installed. (:issue:`7177`) + +- ``DataFrame.fillna`` can now accept a ``DataFrame`` as a fill value (:issue:`8377`) + +- Added ``searchsorted`` method to ``Series`` objects (:issue:`7447`) + +.. _whatsnew_0150.performance: + +Performance +~~~~~~~~~~~ + +- Performance improvements in ``DatetimeIndex.__iter__`` to allow faster iteration (:issue:`7683`) +- Performance improvements in ``Period`` creation (and ``PeriodIndex`` setitem) (:issue:`5155`) +- Improvements in Series.transform for significant performance gains (revised) (:issue:`6496`) +- Performance improvements in ``StataReader`` when reading large files (:issue:`8040`, :issue:`8073`) +- Performance improvements in ``StataWriter`` when writing large files (:issue:`8079`) +- Performance and memory usage improvements in multi-key ``groupby`` (:issue:`8128`) +- Performance improvements in groupby ``.agg`` and ``.apply`` where builtins max/min were not mapped to numpy/cythonized versions (:issue:`7722`) +- Performance improvement in writing to sql (``to_sql``) of up to 50% (:issue:`8208`). +- Performance benchmarking of groupby for large value of ngroups (:issue:`6787`) +- Performance improvement in ``CustomBusinessDay``, ``CustomBusinessMonth`` (:issue:`8236`) +- Performance improvement for ``MultiIndex.values`` for multi-level indexes containing datetimes (:issue:`8543`) + + + + + + + + + + + + + + + + + + + + +.. _whatsnew_0150.bug_fixes: + +Bug Fixes +~~~~~~~~~ +- Bug in pivot_table, when using margins and a dict aggfunc (:issue:`8349`) +- Bug in ``read_csv`` where ``squeeze=True`` would return a view (:issue:`8217`) +- Bug in checking of table name in ``read_sql`` in certain cases (:issue:`7826`). 
+- Bug in ``DataFrame.groupby`` where ``Grouper`` does not recognize level when frequency is specified (:issue:`7885`) +- Bug in multiindexes dtypes getting mixed up when DataFrame is saved to SQL table (:issue:`8021`) +- Bug in ``Series`` 0-division with a float and integer operand dtypes (:issue:`7785`) +- Bug in ``Series.astype("unicode")`` not calling ``unicode`` on the values correctly (:issue:`7758`) +- Bug in ``DataFrame.as_matrix()`` with mixed ``datetime64[ns]`` and ``timedelta64[ns]`` dtypes (:issue:`7778`) +- Bug in ``HDFStore.select_column()`` not preserving UTC timezone info when selecting a ``DatetimeIndex`` (:issue:`7777`) +- Bug in ``to_datetime`` when ``format='%Y%m%d'`` and ``coerce=True`` are specified, where previously an object array was returned (rather than + a coerced time-series with ``NaT``), (:issue:`7930`) +- Bug in ``DatetimeIndex`` and ``PeriodIndex`` in-place addition and subtraction cause different result from normal one (:issue:`6527`) +- Bug in adding and subtracting ``PeriodIndex`` with ``PeriodIndex`` raise ``TypeError`` (:issue:`7741`) +- Bug in ``combine_first`` with ``PeriodIndex`` data raises ``TypeError`` (:issue:`3367`) +- Bug in multi-index slicing with missing indexers (:issue:`7866`) +- Bug in multi-index slicing with various edge cases (:issue:`8132`) +- Regression in multi-index indexing with a non-scalar type object (:issue:`7914`) +- Bug in ``Timestamp`` comparisons with ``==`` and ``int64`` dtype (:issue:`8058`) +- Bug in pickles contains ``DateOffset`` may raise ``AttributeError`` when ``normalize`` attribute is reffered internally (:issue:`7748`) +- Bug in ``Panel`` when using ``major_xs`` and ``copy=False`` is passed (deprecation warning fails because of missing ``warnings``) (:issue:`8152`). +- Bug in pickle deserialization that failed for pre-0.14.1 containers with dup items trying to avoid ambiguity + when matching block and manager items, when there's only one block there's no ambiguity (:issue:`7794`) +- Bug in putting a ``PeriodIndex`` into a ``Series`` would convert to ``int64`` dtype, rather than ``object`` of ``Periods`` (:issue:`7932`) +- Bug in ``HDFStore`` iteration when passing a where (:issue:`8014`) +- Bug in ``DataFrameGroupby.transform`` when transforming with a passed non-sorted key (:issue:`8046`, :issue:`8430`) +- Bug in repeated timeseries line and area plot may result in ``ValueError`` or incorrect kind (:issue:`7733`) +- Bug in inference in a ``MultiIndex`` with ``datetime.date`` inputs (:issue:`7888`) +- Bug in ``get`` where an ``IndexError`` would not cause the default value to be returned (:issue:`7725`) +- Bug in ``offsets.apply``, ``rollforward`` and ``rollback`` may reset nanosecond (:issue:`7697`) +- Bug in ``offsets.apply``, ``rollforward`` and ``rollback`` may raise ``AttributeError`` if ``Timestamp`` has ``dateutil`` tzinfo (:issue:`7697`) +- Bug in sorting a multi-index frame with a ``Float64Index`` (:issue:`8017`) +- Bug in inconsistent panel setitem with a rhs of a ``DataFrame`` for alignment (:issue:`7763`) +- Bug in ``is_superperiod`` and ``is_subperiod`` cannot handle higher frequencies than ``S`` (:issue:`7760`, :issue:`7772`, :issue:`7803`) +- Bug in 32-bit platforms with ``Series.shift`` (:issue:`8129`) +- Bug in ``PeriodIndex.unique`` returns int64 ``np.ndarray`` (:issue:`7540`) +- Bug in ``groupby.apply`` with a non-affecting mutation in the function (:issue:`8467`) +- Bug in ``DataFrame.reset_index`` which has ``MultiIndex`` contains ``PeriodIndex`` or ``DatetimeIndex`` with tz raises 
``ValueError`` (:issue:`7746`, :issue:`7793`)
+
+- Bug in ``DataFrame.plot`` with ``subplots=True`` may draw unnecessary minor xticks and yticks (:issue:`7801`)
+- Bug in ``StataReader`` which did not read variable labels in 117 files due to a difference between the Stata documentation and implementation (:issue:`7816`)
+- Bug in ``StataReader`` where strings were always converted to a fixed width of 244 characters irrespective of the underlying string size (:issue:`7858`)
+
+- Bug in ``DataFrame.plot`` and ``Series.plot`` may ignore the ``rot`` and ``fontsize`` keywords (:issue:`7844`)
+
+- Bug in ``DatetimeIndex.value_counts`` not preserving tz (:issue:`7735`)
+- Bug in ``PeriodIndex.value_counts`` resulting in an ``Int64Index`` (:issue:`7735`)
+- Bug in ``DataFrame.join`` when doing a left join on index and there are multiple matches (:issue:`5391`)
+
+- Bug in ``GroupBy.transform()`` where int groups with a transform that
+  didn't preserve the index were incorrectly truncated (:issue:`7972`).
+
+- Bug in ``groupby`` where callable objects without name attributes would take the wrong path,
+  and produce a ``DataFrame`` instead of a ``Series`` (:issue:`7929`)
+
+- Bug in ``groupby`` error message when a DataFrame grouping column is duplicated (:issue:`7511`)
+
+- Bug in ``read_html`` where the ``infer_types`` argument forced coercion of
+  date-likes incorrectly (:issue:`7762`, :issue:`7032`).
+
+- Bug in ``Series.str.cat`` with an index which was filtered so as not to include the first item (:issue:`7857`)
+
+- Bug where ``Timestamp`` cannot parse ``nanosecond`` from a string (:issue:`7878`)
+- Bug where ``Timestamp`` with a string offset and ``tz`` gives incorrect results (:issue:`7833`)
+
+- Bug in ``tslib.tz_convert`` and ``tslib.tz_convert_single`` may return different results (:issue:`7798`)
+- Bug in ``DatetimeIndex.intersection`` of non-overlapping timestamps with tz raising ``IndexError`` (:issue:`7880`)
+- Bug in alignment with TimeOps and non-unique indexes (:issue:`8363`)
+
+- Bug in ``GroupBy.filter()`` where fast path vs. slow path made the filter
+  return a non-scalar value that appeared valid but wasn't (:issue:`7870`).
+- Bug in ``date_range()``/``DatetimeIndex()`` when the timezone was inferred from input dates yet incorrect
+  times were returned when crossing DST boundaries (:issue:`7835`, :issue:`7901`).
+- Bug in ``to_excel()`` where a negative sign was being prepended to positive infinity and was absent for negative infinity (:issue:`7949`)
+- Bug where an area plot draws its legend with an incorrect ``alpha`` when ``stacked=True`` (:issue:`8027`)
+- ``Period`` and ``PeriodIndex`` addition/subtraction with ``np.timedelta64`` resulting in incorrect internal representations (:issue:`7740`)
+- Bug in ``Holiday`` with no offset or observance (:issue:`7987`)
+
+- Bug in ``DataFrame.to_latex`` formatting when columns or index is a ``MultiIndex`` (:issue:`7982`).
+
+- Bug in ``DateOffset`` around Daylight Savings Time producing unexpected results (:issue:`5175`).
+
+- Bug in ``DataFrame.shift`` where empty columns would throw ``ZeroDivisionError`` on numpy 1.7 (:issue:`8019`)
+
+- Bug in installation where ``html_encoding/*.html`` wasn't installed and
+  therefore some tests were not running correctly (:issue:`7927`).
+
+- Bug in ``read_html`` where ``bytes`` objects were not tested for in
+  ``_read`` (:issue:`7927`).
+
+- Bug in ``DataFrame.stack()`` when one of the column levels was datelike (:issue:`8039`)
+- Bug in broadcasting numpy scalars with ``DataFrame`` (:issue:`8116`)
+
+- Bug in ``pivot_table`` performed with nameless ``index`` and ``columns`` raising ``KeyError`` (:issue:`8103`)
+
+- Bug in ``DataFrame.plot(kind='scatter')`` drawing points and errorbars with different colors when the color is specified by the ``c`` keyword (:issue:`8081`)
+
+- Bug in ``Float64Index`` where ``iat`` and ``at`` were not tested and were
+  failing (:issue:`8092`).
+- Bug in ``DataFrame.boxplot()`` where y-limits were not set correctly when
+  producing multiple axes (:issue:`7528`, :issue:`5517`).
+
+- Bug in ``read_csv`` where line comments were not handled correctly given
+  a custom line terminator or ``delim_whitespace=True`` (:issue:`8122`).
+
+- Bug in ``read_html`` where empty tables caused a ``StopIteration`` (:issue:`7575`)
+- Bug in casting when setting a column in a same-dtype block (:issue:`7704`)
+- Bug in accessing groups from a ``GroupBy`` when the original grouper
+  was a tuple (:issue:`8121`).
+- Bug in ``.at`` that would accept integer indexers on a non-integer index and do fallback (:issue:`7814`)
+- Bug with kde plot and NaNs (:issue:`8182`)
+- Bug in ``GroupBy.count`` with float32 data type where NaN values were not excluded (:issue:`8169`).
+- Bug with stacked barplots and NaNs (:issue:`8175`).
+- Bug in resample with non-evenly divisible offsets (e.g. '7s') (:issue:`8371`)
+
+- Bug in interpolation methods with the ``limit`` keyword when no values needed interpolating (:issue:`7173`).
+- Bug where ``col_space`` was ignored in ``DataFrame.to_string()`` when ``header=False`` (:issue:`8230`).
+- Bug with ``DatetimeIndex.asof`` incorrectly matching partial strings and returning the wrong date (:issue:`8245`).
+- Bug in plotting methods modifying the global matplotlib rcParams (:issue:`8242`).
+- Bug in ``DataFrame.__setitem__`` that caused errors when setting a ``DataFrame`` column to a sparse array (:issue:`8131`)
+- Bug where ``DataFrame.boxplot()`` failed when an entire column was empty (:issue:`8181`).
+- Bug with mixed-up variables in the ``radviz`` visualization (:issue:`8199`).
+- Bug in ``to_clipboard`` that would clip long column data (:issue:`8305`)
+- Bug in ``DataFrame`` terminal display: setting ``max_columns``/``max_rows`` to zero did not trigger auto-resizing of frames to fit the terminal width/height (:issue:`7180`).
+- Bug in OLS where running with "cluster" and "nw_lags" parameters did not work correctly, but also did not throw an error
+  (:issue:`5884`).
+- Bug in ``DataFrame.dropna`` that interpreted non-existent columns in the subset argument as the 'last column' (:issue:`8303`)
+- Bug in ``Index.intersection`` on non-monotonic non-unique indexes (:issue:`8362`).
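A brief editorial sketch (not from the patch) of the call patterns behind two of the fixes above, the interpolation ``limit`` keyword and ``col_space`` in ``to_string``; the data is illustrative::

    import numpy as np
    import pandas as pd

    # limit caps how many consecutive NaNs get filled; the fix above concerns
    # the corner case where nothing needs interpolating at all
    s = pd.Series([1.0, np.nan, np.nan, 4.0])
    s.interpolate(limit=1)            # fills only the first NaN

    # col_space sets a minimum column width and should be honoured even when
    # the header row is suppressed
    df = pd.DataFrame({"x": [1, 2], "y": [3, 4]})
    df.to_string(col_space=12, header=False)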
+- Bug in masked series assignment where mismatching types would break alignment (:issue:`8387`)
+- Bug in ``NDFrame.equals`` giving false negatives with ``dtype=object`` (:issue:`8437`)
+- Bug in assignment with indexer where type diversity would break alignment (:issue:`8258`)
+- Bug in ``NDFrame.loc`` indexing when row/column names were lost when the target was a list/ndarray (:issue:`6552`)
+- Regression in ``NDFrame.loc`` indexing when rows/columns were converted to ``Float64Index`` if the target was an empty list/ndarray (:issue:`7774`)
+- Bug in ``Series`` that allowed it to be indexed by a ``DataFrame``, which had unexpected results. Such indexing is no longer permitted (:issue:`8444`)
+- Bug in item assignment of a ``DataFrame`` with multi-index columns where right-hand-side columns were not aligned (:issue:`7655`)
+- Suppress FutureWarning generated by NumPy when comparing object arrays containing NaN for equality (:issue:`7065`)
+
+- Bug in ``DataFrame.eval()`` where the dtype of the ``not`` operator (``~``)
+  was not correctly inferred as ``bool``.
diff --git a/doc/source/whatsnew/v0.15.1.txt b/doc/source/whatsnew/v0.15.1.txt
new file mode 100644
index 0000000000000..fa1b8b24e75b5
--- /dev/null
+++ b/doc/source/whatsnew/v0.15.1.txt
@@ -0,0 +1,49 @@
+.. _whatsnew_0151:
+
+v0.15.1 (November ??, 2014)
+---------------------------
+
+This is a minor release from 0.15.0 and includes a small number of API changes, several new features,
+enhancements, and performance improvements along with a large number of bug fixes. We recommend that all
+users upgrade to this version.
+
+Highlights include:
+
+- :ref:`Enhancements <whatsnew_0151.enhancements>`
+- :ref:`API Changes <whatsnew_0151.api>`
+- :ref:`Performance Improvements <whatsnew_0151.performance>`
+- :ref:`Experimental Changes <whatsnew_0151.experimental>`
+- :ref:`Bug Fixes <whatsnew_0151.bug_fixes>`
+
+.. _whatsnew_0151.api:
+
+API changes
+~~~~~~~~~~~
+
+
+.. _whatsnew_0151.enhancements:
+
+Enhancements
+~~~~~~~~~~~~
+
+- Added option to select columns when importing Stata files (:issue:`7935`)
+
+- Qualify memory usage in ``DataFrame.info()`` by adding ``+`` if it is a lower bound (:issue:`8578`)
+
+
+.. _whatsnew_0151.performance:
+
+Performance
+~~~~~~~~~~~
+
+
+.. _whatsnew_0151.experimental:
+
+Experimental
+~~~~~~~~~~~~
+
+
+.. 
_whatsnew_0151.bug_fixes: + +Bug Fixes +~~~~~~~~~ diff --git a/doc/source/v0.4.x.txt b/doc/source/whatsnew/v0.4.x.txt similarity index 96% rename from doc/source/v0.4.x.txt rename to doc/source/whatsnew/v0.4.x.txt index 5333bb9ffb157..4717b46a6bca8 100644 --- a/doc/source/v0.4.x.txt +++ b/doc/source/whatsnew/v0.4.x.txt @@ -13,7 +13,7 @@ New Features Series (:issue:`209`, :issue:`203`) - :ref:`Added ` ``Series.align`` method for aligning two series with choice of join method (ENH56_) -- :ref:`Added ` method ``get_level_values`` to +- :ref:`Added ` method ``get_level_values`` to ``MultiIndex`` (:issue:`188`) - Set values in mixed-type ``DataFrame`` objects via ``.ix`` indexing attribute (:issue:`135`) - Added new ``DataFrame`` :ref:`methods ` @@ -28,7 +28,7 @@ New Features - ``DataFrame.rename`` has a new ``copy`` parameter to :ref:`rename ` a DataFrame in place (ENHed_) - :ref:`Enable ` unstacking by name (:issue:`142`) -- :ref:`Enable ` ``sortlevel`` to work by level (:issue:`141`) +- :ref:`Enable ` ``sortlevel`` to work by level (:issue:`141`) Performance Enhancements ~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/v0.5.0.txt b/doc/source/whatsnew/v0.5.0.txt similarity index 98% rename from doc/source/v0.5.0.txt rename to doc/source/whatsnew/v0.5.0.txt index d0550fd5ef8f3..8b7e4721d136f 100644 --- a/doc/source/v0.5.0.txt +++ b/doc/source/whatsnew/v0.5.0.txt @@ -21,7 +21,7 @@ New Features - :ref:`Added` ``pivot_table`` convenience function to pandas namespace (:issue:`234`) - :ref:`Implemented ` ``Panel.rename_axis`` function (:issue:`243`) - DataFrame will show index level names in console output (:issue:`334`) -- :ref:`Implemented ` ``Panel.take`` +- :ref:`Implemented ` ``Panel.take`` - :ref:`Added` ``set_eng_float_format`` for alternate DataFrame floating point string formatting (ENH61_) - :ref:`Added ` convenience ``set_index`` function for creating a DataFrame index from its existing columns - :ref:`Implemented ` ``groupby`` hierarchical index level name (:issue:`223`) diff --git a/doc/source/v0.6.0.txt b/doc/source/whatsnew/v0.6.0.txt similarity index 100% rename from doc/source/v0.6.0.txt rename to doc/source/whatsnew/v0.6.0.txt diff --git a/doc/source/v0.6.1.txt b/doc/source/whatsnew/v0.6.1.txt similarity index 96% rename from doc/source/v0.6.1.txt rename to doc/source/whatsnew/v0.6.1.txt index 7e593d07f7f2b..a2dab738546f9 100644 --- a/doc/source/v0.6.1.txt +++ b/doc/source/whatsnew/v0.6.1.txt @@ -32,7 +32,7 @@ New features - Add ``Series.from_csv`` function (:issue:`482`) - :ref:`Can pass ` DataFrame/DataFrame and DataFrame/Series to rolling_corr/rolling_cov (GH #462) -- MultiIndex.get_level_values can :ref:`accept the level name ` +- MultiIndex.get_level_values can :ref:`accept the level name ` Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/v0.7.0.txt b/doc/source/whatsnew/v0.7.0.txt similarity index 97% rename from doc/source/v0.7.0.txt rename to doc/source/whatsnew/v0.7.0.txt index bf7acd3820db0..cfba2ad3d05b6 100644 --- a/doc/source/v0.7.0.txt +++ b/doc/source/whatsnew/v0.7.0.txt @@ -33,7 +33,7 @@ New features df = DataFrame(randn(10, 4)) df.apply(lambda x: x.describe()) -- :ref:`Add` ``reorder_levels`` method to Series and +- :ref:`Add` ``reorder_levels`` method to Series and DataFrame (:issue:`534`) - :ref:`Add` dict-like ``get`` function to DataFrame @@ -50,7 +50,7 @@ New features - :ref:`Add ` ``level`` option to binary arithmetic functions on ``DataFrame`` and ``Series`` -- :ref:`Add ` ``level`` option to the ``reindex`` +- :ref:`Add ` 
``level`` option to the ``reindex`` and ``align`` methods on Series and DataFrame for broadcasting values across a level (:issue:`542`, :issue:`552`, others) @@ -103,7 +103,7 @@ New features - :ref:`Added ` ``isin`` method to index objects -- :ref:`Added ` ``level`` argument to ``xs`` method of DataFrame. +- :ref:`Added ` ``level`` argument to ``xs`` method of DataFrame. API Changes to integer indexing diff --git a/doc/source/v0.7.1.txt b/doc/source/whatsnew/v0.7.1.txt similarity index 100% rename from doc/source/v0.7.1.txt rename to doc/source/whatsnew/v0.7.1.txt diff --git a/doc/source/v0.7.2.txt b/doc/source/whatsnew/v0.7.2.txt similarity index 100% rename from doc/source/v0.7.2.txt rename to doc/source/whatsnew/v0.7.2.txt diff --git a/doc/source/v0.7.3.txt b/doc/source/whatsnew/v0.7.3.txt similarity index 100% rename from doc/source/v0.7.3.txt rename to doc/source/whatsnew/v0.7.3.txt diff --git a/doc/source/v0.8.0.txt b/doc/source/whatsnew/v0.8.0.txt similarity index 100% rename from doc/source/v0.8.0.txt rename to doc/source/whatsnew/v0.8.0.txt diff --git a/doc/source/v0.8.1.txt b/doc/source/whatsnew/v0.8.1.txt similarity index 95% rename from doc/source/v0.8.1.txt rename to doc/source/whatsnew/v0.8.1.txt index cecf6f16cdc71..8227bc6bc9c66 100644 --- a/doc/source/v0.8.1.txt +++ b/doc/source/whatsnew/v0.8.1.txt @@ -10,7 +10,7 @@ processing functionality and a series of new plot types and options. New features ~~~~~~~~~~~~ - - Add :ref:`vectorized string processing methods ` + - Add :ref:`vectorized string processing methods ` accessible via Series.str (:issue:`620`) - Add option to disable adjustment in EWMA (:issue:`1584`) - :ref:`Radviz plot ` (:issue:`1566`) diff --git a/doc/source/v0.9.0.txt b/doc/source/whatsnew/v0.9.0.txt similarity index 97% rename from doc/source/v0.9.0.txt rename to doc/source/whatsnew/v0.9.0.txt index 2b385a7e7b8f0..b60fb9cc64f4a 100644 --- a/doc/source/v0.9.0.txt +++ b/doc/source/whatsnew/v0.9.0.txt @@ -18,7 +18,7 @@ New features ~~~~~~~~~~~~ - Add ``encode`` and ``decode`` for unicode handling to :ref:`vectorized - string processing methods ` in Series.str (:issue:`1706`) + string processing methods ` in Series.str (:issue:`1706`) - Add ``DataFrame.to_latex`` method (:issue:`1735`) - Add convenient expanding window equivalents of all rolling_* ops (:issue:`1785`) - Add Options class to pandas.io.data for fetching options data from Yahoo! 
diff --git a/doc/source/v0.9.1.txt b/doc/source/whatsnew/v0.9.1.txt similarity index 100% rename from doc/source/v0.9.1.txt rename to doc/source/whatsnew/v0.9.1.txt diff --git a/pandas/__init__.py b/pandas/__init__.py index df5e6f567e3a6..69e8a4bad377e 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -53,11 +53,13 @@ from pandas.io.api import * from pandas.computation.api import * -from pandas.tools.describe import value_range from pandas.tools.merge import merge, concat, ordered_merge from pandas.tools.pivot import pivot_table, crosstab from pandas.tools.plotting import scatter_matrix, plot_params from pandas.tools.tile import cut, qcut +from pandas.tools.util import value_range from pandas.core.reshape import melt from pandas.util.print_versions import show_versions import pandas.util.testing + + diff --git a/pandas/algos.pyx b/pandas/algos.pyx index 1c1d32e1d2a20..316a282b71609 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -712,17 +712,15 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average', # # - -def _check_minp(win, minp, N): +def _check_minp(win, minp, N, floor=1): if minp > win: raise ValueError('min_periods (%d) must be <= window (%d)' % (minp, win)) elif minp > N: minp = N + 1 - elif minp == 0: - minp = 1 elif minp < 0: raise ValueError('min_periods must be >= 0') - return minp + return max(minp, floor) # original C implementation by N. Devillard. # This code in public domain. @@ -979,7 +977,7 @@ def roll_mean(ndarray[double_t] input, #------------------------------------------------------------------------------- # Exponentially weighted moving average -def ewma(ndarray[double_t] input, double_t com, int adjust, int ignore_na): +def ewma(ndarray[double_t] input, double_t com, int adjust, int ignore_na, int minp): ''' Compute exponentially-weighted moving average using center-of-mass. @@ -989,45 +987,146 @@ def ewma(ndarray[double_t] input, double_t com, int adjust, int ignore_na): com : float64 adjust: int ignore_na: int + minp: int Returns ------- y : ndarray ''' - cdef double cur, prev, neww, oldw, adj - cdef Py_ssize_t i cdef Py_ssize_t N = len(input) - cdef ndarray[double_t] output = np.empty(N, dtype=float) - if N == 0: return output + minp = max(minp, 1) + + cdef double alpha, old_wt_factor, new_wt, weighted_avg, old_wt, cur + cdef Py_ssize_t i, nobs + alpha = 1. / (1. + com) old_wt_factor = 1. - alpha - new_wt = 1.0 if adjust else alpha + new_wt = 1. if adjust else alpha - output[0] = input[0] - weighted_avg = output[0] + weighted_avg = input[0] + is_observation = (weighted_avg == weighted_avg) + nobs = int(is_observation) + output[0] = weighted_avg if (nobs >= minp) else NaN old_wt = 1. for i from 1 <= i < N: cur = input[i] + is_observation = (cur == cur) + nobs += int(is_observation) if weighted_avg == weighted_avg: - if cur == cur: - old_wt *= old_wt_factor - weighted_avg = ((old_wt * weighted_avg) + (new_wt * cur)) / (old_wt + new_wt) - if adjust: - old_wt += new_wt - else: - old_wt = 1. - elif not ignore_na: + if is_observation or (not ignore_na): old_wt *= old_wt_factor - else: + if is_observation: + if weighted_avg != cur: # avoid numerical errors on constant series + weighted_avg = ((old_wt * weighted_avg) + (new_wt * cur)) / (old_wt + new_wt) + if adjust: + old_wt += new_wt + else: + old_wt = 1. 
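+        # weighted_avg is still NaN here (no valid observation so far), so the
+        # running average is restarted at the first valid value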
+ elif is_observation: weighted_avg = cur - output[i] = weighted_avg + output[i] = weighted_avg if (nobs >= minp) else NaN + + return output + +#------------------------------------------------------------------------------- +# Exponentially weighted moving covariance + +def ewmcov(ndarray[double_t] input_x, ndarray[double_t] input_y, + double_t com, int adjust, int ignore_na, int minp, int bias): + ''' + Compute exponentially-weighted moving variance using center-of-mass. + + Parameters + ---------- + input_x : ndarray (float64 type) + input_y : ndarray (float64 type) + com : float64 + adjust: int + ignore_na: int + minp: int + bias: int + + Returns + ------- + y : ndarray + ''' + + cdef Py_ssize_t N = len(input_x) + if len(input_y) != N: + raise ValueError('arrays are of different lengths (%d and %d)' % (N, len(input_y))) + cdef ndarray[double_t] output = np.empty(N, dtype=float) + if N == 0: + return output + + minp = max(minp, 1) + + cdef double alpha, old_wt_factor, new_wt, mean_x, mean_y, cov + cdef double sum_wt, sum_wt2, old_wt, cur_x, cur_y, old_mean_x, old_mean_y + cdef Py_ssize_t i, nobs + + alpha = 1. / (1. + com) + old_wt_factor = 1. - alpha + new_wt = 1. if adjust else alpha + + mean_x = input_x[0] + mean_y = input_y[0] + is_observation = ((mean_x == mean_x) and (mean_y == mean_y)) + nobs = int(is_observation) + if not is_observation: + mean_x = NaN + mean_y = NaN + output[0] = (0. if bias else NaN) if (nobs >= minp) else NaN + cov = 0. + sum_wt = 1. + sum_wt2 = 1. + old_wt = 1. + + for i from 1 <= i < N: + cur_x = input_x[i] + cur_y = input_y[i] + is_observation = ((cur_x == cur_x) and (cur_y == cur_y)) + nobs += int(is_observation) + if mean_x == mean_x: + if is_observation or (not ignore_na): + sum_wt *= old_wt_factor + sum_wt2 *= (old_wt_factor * old_wt_factor) + old_wt *= old_wt_factor + if is_observation: + old_mean_x = mean_x + old_mean_y = mean_y + if mean_x != cur_x: # avoid numerical errors on constant series + mean_x = ((old_wt * old_mean_x) + (new_wt * cur_x)) / (old_wt + new_wt) + if mean_y != cur_y: # avoid numerical errors on constant series + mean_y = ((old_wt * old_mean_y) + (new_wt * cur_y)) / (old_wt + new_wt) + cov = ((old_wt * (cov + ((old_mean_x - mean_x) * (old_mean_y - mean_y)))) + + (new_wt * ((cur_x - mean_x) * (cur_y - mean_y)))) / (old_wt + new_wt) + sum_wt += new_wt + sum_wt2 += (new_wt * new_wt) + old_wt += new_wt + if not adjust: + sum_wt /= old_wt + sum_wt2 /= (old_wt * old_wt) + old_wt = 1. + elif is_observation: + mean_x = cur_x + mean_y = cur_y + + if nobs >= minp: + if not bias: + numerator = sum_wt * sum_wt + denominator = numerator - sum_wt2 + output[i] = ((numerator / denominator) * cov) if (denominator > 0.) 
else NaN + else: + output[i] = cov + else: + output[i] = NaN return output @@ -1182,7 +1281,7 @@ def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1): mean_x += delta / nobs ssqdm_x += delta * (val - mean_x) - if nobs >= minp: + if (nobs >= minp) and (nobs > ddof): #pathological case if nobs == 1: val = 0 @@ -1226,7 +1325,7 @@ def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1): ssqdm_x = 0 # Variance is unchanged if no observation is added or removed - if nobs >= minp: + if (nobs >= minp) and (nobs > ddof): #pathological case if nobs == 1: val = 0 @@ -1287,17 +1386,14 @@ def roll_skew(ndarray[double_t] input, int win, int minp): xxx -= prev * prev * prev nobs -= 1 - if nobs >= minp: A = x / nobs B = xx / nobs - A * A C = xxx / nobs - A * A * A - 3 * A * B - - R = sqrt(B) - - if B == 0 or nobs < 3: + if B <= 0 or nobs < 3: output[i] = NaN else: + R = sqrt(B) output[i] = ((sqrt(nobs * (nobs - 1.)) * C) / ((nobs-2) * R * R * R)) else: @@ -1750,8 +1846,9 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int win, return output -def roll_generic(ndarray[float64_t, cast=True] input, int win, - int minp, object func, object args, object kwargs): +def roll_generic(ndarray[float64_t, cast=True] input, + int win, int minp, int offset, + object func, object args, object kwargs): cdef ndarray[double_t] output, counts, bufarr cdef Py_ssize_t i, n cdef float64_t *buf @@ -1760,43 +1857,47 @@ def roll_generic(ndarray[float64_t, cast=True] input, int win, if not input.flags.c_contiguous: input = input.copy('C') - buf = input.data - n = len(input) if n == 0: return input - minp = _check_minp(win, minp, n) + minp = _check_minp(win, minp, n, floor=0) output = np.empty(n, dtype=float) - counts = roll_sum(np.isfinite(input).astype(float), win, minp) - - bufarr = np.empty(win, dtype=float) - oldbuf = bufarr.data + counts = roll_sum(np.concatenate((np.isfinite(input).astype(float), np.array([0.] 
* offset))), win, minp)[offset:] - n = len(input) - for i from 0 <= i < int_min(win, n): + # truncated windows at the beginning, through first full-length window + for i from 0 <= i < (int_min(win, n) - offset): if counts[i] >= minp: - output[i] = func(input[int_max(i - win + 1, 0) : i + 1], *args, - **kwargs) + output[i] = func(input[0 : (i + offset + 1)], *args, **kwargs) else: output[i] = NaN - for i from win <= i < n: + # remaining full-length windows + buf = input.data + bufarr = np.empty(win, dtype=float) + oldbuf = bufarr.data + for i from (win - offset) <= i < (n - offset): buf = buf + 1 bufarr.data = buf if counts[i] >= minp: output[i] = func(bufarr, *args, **kwargs) else: output[i] = NaN - bufarr.data = oldbuf + # truncated windows at the end + for i from int_max(n - offset, 0) <= i < n: + if counts[i] >= minp: + output[i] = func(input[int_max(i + offset - win + 1, 0) : n], *args, **kwargs) + else: + output[i] = NaN + return output def roll_window(ndarray[float64_t, ndim=1, cast=True] input, ndarray[float64_t, ndim=1, cast=True] weights, - int minp, bint avg=True, bint avg_wgt=False): + int minp, bint avg=True): """ Assume len(weights) << len(input) """ @@ -1814,7 +1915,7 @@ def roll_window(ndarray[float64_t, ndim=1, cast=True] input, minp = _check_minp(len(weights), minp, in_n) - if avg_wgt: + if avg: for win_i from 0 <= win_i < win_n: val_win = weights[win_i] if val_win != val_win: @@ -1855,8 +1956,6 @@ def roll_window(ndarray[float64_t, ndim=1, cast=True] input, c = counts[in_i] if c < minp: output[in_i] = NaN - elif avg: - output[in_i] /= c return output diff --git a/pandas/compat/openpyxl_compat.py b/pandas/compat/openpyxl_compat.py index 25ba83d58aaed..266aded2071b6 100644 --- a/pandas/compat/openpyxl_compat.py +++ b/pandas/compat/openpyxl_compat.py @@ -10,15 +10,26 @@ stop_ver = '2.0.0' -def is_compat(): - """Detect whether the installed version of openpyxl is supported. +def is_compat(major_ver=1): + """Detect whether the installed version of openpyxl is supported + Parameters + ---------- + ver : int + 1 requests compatibility status among the 1.x.y series + 2 requests compatibility status of 2.0.0 and later Returns ------- compat : bool - ``True`` if openpyxl is installed and is between versions 1.6.1 and - 2.0.0, ``False`` otherwise. + ``True`` if openpyxl is installed and is a compatible version. + ``False`` otherwise. 
""" import openpyxl ver = LooseVersion(openpyxl.__version__) - return LooseVersion(start_ver) < ver <= LooseVersion(stop_ver) + if major_ver == 1: + return LooseVersion(start_ver) <= ver < LooseVersion(stop_ver) + elif major_ver == 2: + return LooseVersion(stop_ver) <= ver + else: + raise ValueError('cannot test for openpyxl compatibility with ver {0}' + .format(major_ver)) diff --git a/pandas/computation/expressions.py b/pandas/computation/expressions.py index bd00dbbb444b6..70541c94b4e8e 100644 --- a/pandas/computation/expressions.py +++ b/pandas/computation/expressions.py @@ -17,7 +17,8 @@ _NUMEXPR_INSTALLED = ver >= LooseVersion('2.1') if not _NUMEXPR_INSTALLED: warnings.warn("The installed version of numexpr {ver} is not supported " - "in pandas and will be not be used".format(ver=ver), UserWarning) + "in pandas and will be not be used\nThe minimum supported " + "version is 2.1\n".format(ver=ver), UserWarning) except ImportError: # pragma: no cover _NUMEXPR_INSTALLED = False diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index 81526b88c2b51..9df9975b4b61c 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -1,10 +1,8 @@ """Operator classes for eval. """ -import re import operator as op from functools import partial -from itertools import product, islice, chain from datetime import datetime import numpy as np @@ -69,7 +67,6 @@ def evaluate(self, *args, **kwargs): return self def _resolve_name(self): - key = self.name res = self.env.resolve(self.local_name, is_local=self.is_local) self.update(res) @@ -491,3 +488,13 @@ def __call__(self, env): def __unicode__(self): return com.pprint_thing('{0}({1})'.format(self.op, self.operand)) + + @property + def return_type(self): + operand = self.operand + if operand.return_type == np.dtype('bool'): + return np.dtype('bool') + if (isinstance(operand, Op) and + (operand.op in _cmp_ops_dict or operand.op in _bool_ops_dict)): + return np.dtype('bool') + return np.dtype('int') diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py index 9a1e61ad30386..25d6a7f293dac 100644 --- a/pandas/computation/pytables.py +++ b/pandas/computation/pytables.py @@ -180,7 +180,7 @@ def stringify(value): v = time.mktime(v.timetuple()) return TermValue(v, pd.Timestamp(v), kind) elif kind == u('timedelta64') or kind == u('timedelta'): - v = _coerce_scalar_to_timedelta_type(v, unit='s').item() + v = _coerce_scalar_to_timedelta_type(v, unit='s').value return TermValue(int(v), v, kind) elif kind == u('integer'): v = int(float(v)) diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 56d6ccd0abd9b..0c07dc5c214b4 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -134,9 +134,11 @@ def tearDown(self): @slow def test_complex_cmp_ops(self): - for lhs, cmp1, rhs, binop, cmp2 in product(self.lhses, self.cmp_ops, + cmp_ops = ('!=', '==', '<=', '>=', '<', '>') + cmp2_ops = ('>', '<') + for lhs, cmp1, rhs, binop, cmp2 in product(self.lhses, cmp_ops, self.rhses, self.bin_ops, - self.cmp2_ops): + cmp2_ops): self.check_complex_cmp_op(lhs, cmp1, rhs, binop, cmp2) def test_simple_cmp_ops(self): @@ -1625,6 +1627,26 @@ def test_inf(): yield check_inf, engine, parser +def check_negate_lt_eq_le(engine, parser): + tm.skip_if_no_ne(engine) + df = pd.DataFrame([[0, 10], [1, 20]], columns=['cat', 'count']) + expected = df[~(df.cat > 0)] + + result = df.query('~(cat > 0)', engine=engine, parser=parser) + tm.assert_frame_equal(result, expected) + + 
if parser == 'python': + with tm.assertRaises(NotImplementedError): + df.query('not (cat > 0)', engine=engine, parser=parser) + else: + result = df.query('not (cat > 0)', engine=engine, parser=parser) + tm.assert_frame_equal(result, expected) + +def test_negate_lt_eq_le(): + for engine, parser in product(_engines, expr._parsers): + yield check_negate_lt_eq_le, engine, parser + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 4abb6ed10d6a7..8c4f45fdeb57a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -124,7 +124,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1): from pandas.core.index import Index from pandas.core.series import Series vals = np.asarray(values) + is_datetime = com.is_datetime64_dtype(vals) + is_timedelta = com.is_timedelta64_dtype(vals) (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables) table = hash_klass(len(vals)) @@ -161,6 +163,8 @@ def factorize(values, sort=False, order=None, na_sentinel=-1): if is_datetime: uniques = uniques.astype('M8[ns]') + elif is_timedelta: + uniques = uniques.astype('m8[ns]') if isinstance(values, Index): uniques = values._simple_new(uniques, None, freq=getattr(values, 'freq', None), tz=getattr(values, 'tz', None)) @@ -196,8 +200,9 @@ def value_counts(values, sort=True, ascending=False, normalize=False, """ from pandas.core.series import Series from pandas.tools.tile import cut + from pandas.tseries.period import PeriodIndex - is_period = getattr(values, 'inferred_type', None) == 'period' + is_period = com.is_period_arraylike(values) values = Series(values).values is_category = com.is_categorical_dtype(values.dtype) @@ -208,13 +213,16 @@ def value_counts(values, sort=True, ascending=False, normalize=False, raise TypeError("bins argument only works with numeric data.") values = cat.codes elif is_category: - bins = values.levels + bins = values.categories cat = values values = cat.codes dtype = values.dtype if issubclass(values.dtype.type, (np.datetime64, np.timedelta64)) or is_period: + if is_period: + values = PeriodIndex(values) + values = values.view(np.int64) keys, counts = htable.value_count_int64(values) @@ -240,11 +248,11 @@ def value_counts(values, sort=True, ascending=False, normalize=False, result = Series(counts, index=com._values_from_object(keys)) if bins is not None: # TODO: This next line should be more efficient - result = result.reindex(np.arange(len(cat.levels)), fill_value=0) + result = result.reindex(np.arange(len(cat.categories)), fill_value=0) if not is_category: result.index = bins[:-1] else: - result.index = cat.levels + result.index = cat.categories if sort: result.sort() @@ -397,7 +405,8 @@ def _get_data_algo(values, func_map): if com.is_float_dtype(values): f = func_map['float64'] values = com._ensure_float64(values) - elif com.is_datetime64_dtype(values): + + elif com.needs_i8_conversion(values): # if we have NaT, punt to object dtype mask = com.isnull(values) diff --git a/pandas/core/api.py b/pandas/core/api.py index b7e02917cd476..a8b10342593ce 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -25,6 +25,7 @@ from pandas.tseries.tools import to_datetime from pandas.tseries.index import (DatetimeIndex, Timestamp, date_range, bdate_range) +from pandas.tseries.tdi import TimedeltaIndex, Timedelta from pandas.tseries.period import Period, PeriodIndex # legacy diff --git a/pandas/core/array.py b/pandas/core/array.py 
deleted file mode 100644 index 495f231921a19..0000000000000 --- a/pandas/core/array.py +++ /dev/null @@ -1,37 +0,0 @@ -""" -Isolate pandas's exposure to NumPy -""" - -import numpy as np - -Array = np.ndarray - -bool = np.bool_ - -_dtypes = { - 'int': [8, 16, 32, 64], - 'uint': [8, 16, 32, 64], - 'float': [16, 32, 64] -} - -_lift_types = [] - -for _k, _v in _dtypes.items(): - for _i in _v: - _lift_types.append(_k + str(_i)) - -for _t in _lift_types: - globals()[_t] = getattr(np, _t) - -_lift_function = ['empty', 'arange', 'array', 'putmask', 'where'] - -for _f in _lift_function: - globals()[_f] = getattr(np, _f) - -_lift_random = ['randn', 'rand'] - -for _f in _lift_random: - globals()[_f] = getattr(np.random, _f) - -NA = np.nan - diff --git a/pandas/core/base.py b/pandas/core/base.py index 348fb4f23cefc..5d6f39e1792c3 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -161,7 +161,9 @@ def f(self, *args, **kwargs): else: f = _create_delegator_method(name) - setattr(cls,name,f) + # don't overwrite existing methods/properties + if not hasattr(cls, name): + setattr(cls,name,f) class FrozenList(PandasObject, list): @@ -497,7 +499,7 @@ def searchsorted(self, key, side='left'): @Appender(_shared_docs['drop_duplicates'] % _indexops_doc_kwargs) def drop_duplicates(self, take_last=False, inplace=False): duplicated = self.duplicated(take_last=take_last) - result = self[~duplicated.values] + result = self[~(duplicated.values).astype(bool)] if inplace: return self._update_inplace(result) else: @@ -536,217 +538,6 @@ def duplicated(self, take_last=False): #---------------------------------------------------------------------- # abstracts - def _update_inplace(self, result): + def _update_inplace(self, result, **kwargs): raise NotImplementedError - -class DatetimeIndexOpsMixin(object): - """ common ops mixin to support a unified inteface datetimelike Index """ - - def __iter__(self): - return (self._box_func(v) for v in self.asi8) - - @property - def _box_func(self): - """ - box function to get object from internal representation - """ - raise NotImplementedError - - def _box_values(self, values): - """ - apply box func to passed values - """ - return lib.map_infer(values, self._box_func) - - @cache_readonly - def hasnans(self): - """ return if I have any nans; enables various perf speedups """ - return (self.asi8 == tslib.iNaT).any() - - @property - def asobject(self): - from pandas.core.index import Index - return Index(self._box_values(self.asi8), name=self.name, dtype=object) - - def tolist(self): - """ - return a list of the underlying data - """ - return list(self.asobject) - - def min(self, axis=None): - """ - return the minimum value of the Index - - See also - -------- - numpy.ndarray.min - """ - try: - i8 = self.asi8 - - # quick check - if len(i8) and self.is_monotonic: - if i8[0] != tslib.iNaT: - return self._box_func(i8[0]) - - if self.hasnans: - mask = i8 == tslib.iNaT - min_stamp = self[~mask].asi8.min() - else: - min_stamp = i8.min() - return self._box_func(min_stamp) - except ValueError: - return self._na_value - - def argmin(self, axis=None): - """ - return a ndarray of the minimum argument indexer - - See also - -------- - numpy.ndarray.argmin - """ - - i8 = self.asi8 - if self.hasnans: - mask = i8 == tslib.iNaT - if mask.all(): - return -1 - i8 = i8.copy() - i8[mask] = np.iinfo('int64').max - return i8.argmin() - - def max(self, axis=None): - """ - return the maximum value of the Index - - See also - -------- - numpy.ndarray.max - """ - try: - i8 = self.asi8 - - # quick check 
- if len(i8) and self.is_monotonic: - if i8[-1] != tslib.iNaT: - return self._box_func(i8[-1]) - - if self.hasnans: - mask = i8 == tslib.iNaT - max_stamp = self[~mask].asi8.max() - else: - max_stamp = i8.max() - return self._box_func(max_stamp) - except ValueError: - return self._na_value - - def argmax(self, axis=None): - """ - return a ndarray of the maximum argument indexer - - See also - -------- - numpy.ndarray.argmax - """ - - i8 = self.asi8 - if self.hasnans: - mask = i8 == tslib.iNaT - if mask.all(): - return -1 - i8 = i8.copy() - i8[mask] = 0 - return i8.argmax() - - @property - def _formatter_func(self): - """ - Format function to convert value to representation - """ - return str - - def _format_footer(self): - tagline = 'Length: %d, Freq: %s, Timezone: %s' - return tagline % (len(self), self.freqstr, self.tz) - - def __unicode__(self): - formatter = self._formatter_func - summary = str(self.__class__) + '\n' - - n = len(self) - if n == 0: - pass - elif n == 1: - first = formatter(self[0]) - summary += '[%s]\n' % first - elif n == 2: - first = formatter(self[0]) - last = formatter(self[-1]) - summary += '[%s, %s]\n' % (first, last) - else: - first = formatter(self[0]) - last = formatter(self[-1]) - summary += '[%s, ..., %s]\n' % (first, last) - - summary += self._format_footer() - return summary - - @cache_readonly - def _resolution(self): - from pandas.tseries.frequencies import Resolution - return Resolution.get_reso_from_freq(self.freqstr) - - @cache_readonly - def resolution(self): - """ - Returns day, hour, minute, second, millisecond or microsecond - """ - from pandas.tseries.frequencies import get_reso_string - return get_reso_string(self._resolution) - - def __add__(self, other): - from pandas.core.index import Index - from pandas.tseries.offsets import DateOffset - if isinstance(other, Index): - return self.union(other) - elif isinstance(other, (DateOffset, datetime.timedelta, np.timedelta64)): - return self._add_delta(other) - elif com.is_integer(other): - return self.shift(other) - else: # pragma: no cover - return NotImplemented - - def __sub__(self, other): - from pandas.core.index import Index - from pandas.tseries.offsets import DateOffset - if isinstance(other, Index): - return self.diff(other) - elif isinstance(other, (DateOffset, datetime.timedelta, np.timedelta64)): - return self._add_delta(-other) - elif com.is_integer(other): - return self.shift(-other) - else: # pragma: no cover - return NotImplemented - - __iadd__ = __add__ - __isub__ = __sub__ - - def _add_delta(self, other): - return NotImplemented - - def unique(self): - """ - Index.unique with handling for DatetimeIndex/PeriodIndex metadata - - Returns - ------- - result : DatetimeIndex or PeriodIndex - """ - from pandas.core.index import Int64Index - result = Int64Index.unique(self) - return self._simple_new(result, name=self.name, freq=self.freq, - tz=getattr(self, 'tz', None)) - diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index c9674aea4a715..b35cfdcf7c8f1 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -2,32 +2,56 @@ import numpy as np from warnings import warn +import types from pandas import compat from pandas.compat import u -from pandas.core.algorithms import factorize, unique -from pandas.core.base import PandasObject +from pandas.core.algorithms import factorize +from pandas.core.base import PandasObject, PandasDelegate from pandas.core.index import Index, _ensure_index from pandas.core.indexing import _is_null_slice from 
pandas.tseries.period import PeriodIndex import pandas.core.common as com + +from pandas.core.common import isnull from pandas.util.terminal import get_terminal_size from pandas.core.config import get_option from pandas.core import format as fmt def _cat_compare_op(op): def f(self, other): - if isinstance(other, (Categorical, np.ndarray)): - values = np.asarray(self) - f = getattr(values, op) - return f(np.asarray(other)) - else: - if other in self.levels: - i = self.levels.get_loc(other) + # On python2, you can usually compare any type to any type, and Categoricals can be + # seen as a custom type, but having different results depending whether categories are + # the same or not is kind of insane, so be a bit stricter here and use the python3 idea + # of comparing only things of equal type. + if not self.ordered: + if op in ['__lt__', '__gt__','__le__','__ge__']: + raise TypeError("Unordered Categoricals can only compare equality or not") + if isinstance(other, Categorical): + # Two Categoricals can only be be compared if the categories are the same + if (len(self.categories) != len(other.categories)) or \ + not ((self.categories == other.categories).all()): + raise TypeError("Categoricals can only be compared if 'categories' are the same") + if not (self.ordered == other.ordered): + raise TypeError("Categoricals can only be compared if 'ordered' is the same") + na_mask = (self._codes == -1) | (other._codes == -1) + f = getattr(self._codes, op) + ret = f(other._codes) + if na_mask.any(): + # In other series, the leads to False, so do that here too + ret[na_mask] = False + return ret + elif np.isscalar(other): + if other in self.categories: + i = self.categories.get_loc(other) return getattr(self._codes, op)(i) else: return np.repeat(False, len(self)) + else: + msg = "Cannot compare a Categorical for op {op} with type {typ}. If you want to \n" \ + "compare values, use 'np.asarray(cat) other'." + raise TypeError(msg.format(op=op,typ=type(other))) f.__name__ = op @@ -43,38 +67,39 @@ def _maybe_to_categorical(array): return array.values return array - -_codes_doc = """The level codes of this categorical. +_codes_doc = """The category codes of this categorical. Level codes are an array if integer which are the positions of the real -values in the levels array. +values in the categories array. -There is not setter, used the other categorical methods and the item setter on -Categorical to change values in the categorical. +There is not setter, use the other categorical methods and the normal item setter to change +values in the categorical. """ -_levels_doc = """The levels of this categorical. +_categories_doc = """The categories of this categorical. -Setting assigns new values to each level (effectively a rename of -each individual level). +Setting assigns new values to each category (effectively a rename of +each individual category). -The assigned value has to be a list-like object. If the number of -level-items is less than number of level-items in the current level, -all level-items at a higher position are set to NaN. If the number of -level-items is more that the current number of level-items, new -(unused) levels are added at the end. +The assigned value has to be a list-like object. All items must be unique and the number of items +in the new categories must be the same as the number of items in the old categories. -To add level-items in between, use `reorder_levels`. +Assigning to `categories` is a inplace operation! 
Raises ------ ValueError - If the new levels do not validate as levels + If the new categories do not validate as categories or if the number of new categories is + unequal the number of old categories See also -------- -Categorical.reorder_levels -Categorical.remove_unused_levels +rename_categories +reorder_categories +add_categories +remove_categories +remove_unused_categories +set_categories """ class Categorical(PandasObject): @@ -82,21 +107,21 @@ class Categorical(PandasObject): Represents a categorical variable in classic R / S-plus fashion `Categoricals` can only take on only a limited, and usually fixed, number - of possible values (`levels`). In contrast to statistical categorical + of possible values (`categories`). In contrast to statistical categorical variables, a `Categorical` might have an order, but numerical operations (additions, divisions, ...) are not possible. - All values of the `Categorical` are either in `levels` or `np.nan`. - Assigning values outside of `levels` will raise a `ValueError`. Order is - defined by the order of the `levels`, not lexical order of the values. + All values of the `Categorical` are either in `categories` or `np.nan`. + Assigning values outside of `categories` will raise a `ValueError`. Order is + defined by the order of the `categories`, not lexical order of the values. Parameters ---------- values : list-like - The values of the categorical. If levels are given, values not in levels will + The values of the categorical. If categories are given, values not in categories will be replaced with NaN. - levels : Index-like (unique), optional - The unique levels for this categorical. If not given, the levels are assumed + categories : Index-like (unique), optional + The unique categories for this categorical. If not given, the categories are assumed to be the unique values of values. ordered : boolean, optional Whether or not this categorical is treated as a ordered categorical. If not given, @@ -104,48 +129,37 @@ class Categorical(PandasObject): name : str, optional Name for the Categorical variable. If name is None, will attempt to infer from values. - compat : boolean, default=False - Whether to treat values as codes to the levels (old API, deprecated) Attributes ---------- - levels : ndarray - The levels of this categorical - codes : Index - The codes (integer positions, which point to the levels) of this categorical, read only + categories : Index + The categories of this categorical + codes : ndarray + The codes (integer positions, which point to the categories) of this categorical, read only. ordered : boolean - Whether or not this Categorical is ordered + Whether or not this Categorical is ordered. name : string - The name of this Categorical + The name of this Categorical. Raises ------ ValueError - If the levels do not validate + If the categories do not validate. TypeError - If an explicit ``ordered=True`` is given but no `levels` and the `values` are not sortable + If an explicit ``ordered=True`` is given but no `categories` and the `values` are + not sortable. 
Examples -------- >>> from pandas import Categorical >>> Categorical([1, 2, 3, 1, 2, 3]) - 1 - 2 - 3 - 1 - 2 - 3 - Levels (3): Int64Index([1, 2, 3], dtype=int64), ordered + [1, 2, 3, 1, 2, 3] + Categories (3, int64): [1 < 2 < 3] >>> Categorical(['a', 'b', 'c', 'a', 'b', 'c']) - a - b - c - a - b - c - Levels (3): Index(['a', 'b', 'c'], dtype=object), ordered + [a, b, c, a, b, c] + Categories (3, object): [a < b < c] >>> a = Categorical(['a','b','c','a','b','c'], ['c', 'b', 'a']) >>> a.min() @@ -161,7 +175,7 @@ class Categorical(PandasObject): """Whether or not this Categorical is ordered. Only ordered `Categoricals` can be sorted (according to the order - of the levels) and have a min and max value. + of the categories) and have a min and max value. See also -------- @@ -171,19 +185,35 @@ class Categorical(PandasObject): Categorical.max """ - def __init__(self, values, levels=None, ordered=None, name=None, fastpath=False, compat=False): + # For comparisons, so that numpy uses our implementation if the compare ops, which raise + __array_priority__ = 1000 + ordered = False + name = None + + def __init__(self, values, categories=None, ordered=None, name=None, fastpath=False, + levels=None): if fastpath: # fast path - self._codes = values + self._codes = _coerce_codes_dtype(values, categories) self.name = name - self.levels = levels + self.categories = categories self.ordered = ordered return if name is None: name = getattr(values, 'name', None) + # TODO: Remove after deprecation period in 2017/ after 0.18 + if not levels is None: + warn("Creating a 'Categorical' with 'levels' is deprecated, use 'categories' instead", + FutureWarning) + if categories is None: + categories = levels + else: + raise ValueError("Cannot pass in both 'categories' and (deprecated) 'levels', " + "use only 'categories'") + # sanitize input if com.is_categorical_dtype(values): @@ -191,8 +221,8 @@ def __init__(self, values, levels=None, ordered=None, name=None, fastpath=False, cat = values if isinstance(values, com.ABCSeries): cat = values.values - if levels is None: - levels = cat.levels + if categories is None: + categories = cat.categories if ordered is None: ordered = cat.ordered values = values.__array__() @@ -206,61 +236,63 @@ def __init__(self, values, levels=None, ordered=None, name=None, fastpath=False, # which is fine, but since factorize does this correctly no need here # this is an issue because _sanitize_array also coerces np.nan to a string # under certain versions of numpy as well - inferred = com._possibly_infer_to_datetimelike(values) - if not isinstance(inferred, np.ndarray): + values = com._possibly_infer_to_datetimelike(values, convert_dates=True) + if not isinstance(values, np.ndarray): + values = _convert_to_list_like(values) from pandas.core.series import _sanitize_array - values = _sanitize_array(values, None) + # On list with NaNs, int values will be converted to float. Use "object" dtype + # to prevent this. In the end objects will be casted to int/... in the category + # assignment step. 
+ dtype = 'object' if isnull(values).any() else None + values = _sanitize_array(values, None, dtype=dtype) - if levels is None: + if categories is None: try: - codes, levels = factorize(values, sort=True) + codes, categories = factorize(values, sort=True) # If the underlying data structure was sortable, and the user doesn't want to # "forget" this order, the categorical also is sorted/ordered if ordered is None: ordered = True except TypeError: - codes, levels = factorize(values, sort=False) + codes, categories = factorize(values, sort=False) if ordered: - # raise, as we don't have a sortable data structure and so the usershould - # give us one by specifying levels - raise TypeError("'values' is not ordered, please explicitly specify the level " - "order by passing in a level argument.") + # raise, as we don't have a sortable data structure and so the user should + # give us one by specifying categories + raise TypeError("'values' is not ordered, please explicitly specify the " + "categories order by passing in a categories argument.") else: - # there are two ways if levels are present - # the old one, where each value is a int pointer to the levels array - # the new one, where each value is also in the level array (or np.nan) + # there were two ways if categories are present + # - the old one, where each value is a int pointer to the levels array -> not anymore + # possible, but code outside of pandas could call us like that, so make some checks + # - the new one, where each value is also in the categories array (or np.nan) # make sure that we always have the same type here, no matter what we get passed in - levels = self._validate_levels(levels) - - # There can be two ways: the old which passed in codes and levels directly - # and values have to be inferred and the new one, which passes in values and levels - # and _codes have to be inferred. - - # min and max can be higher and lower if not all levels are in the values - if compat and (com.is_integer_dtype(values) and - (np.min(values) >= -1) and (np.max(values) < len(levels))): - warn("Using 'values' as codes is deprecated.\n" - "'Categorical(... , compat=True)' is only there for historical reasons and " - "should not be used in new code!\n" - "See https://github.com/pydata/pandas/pull/7217", FutureWarning) - codes = values - else: - codes = _get_codes_for_values(values, levels) + categories = self._validate_categories(categories) - # if we got levels, we can assume that the order is intended - # if ordered is unspecified - if ordered is None: - ordered = True + codes = _get_codes_for_values(values, categories) + + # TODO: check for old style usage. These warnings should be removes after 0.18/ in 2016 + if com.is_integer_dtype(values) and not com.is_integer_dtype(categories): + warn("Values and categories have different dtypes. Did you mean to use\n" + "'Categorical.from_codes(codes, categories)'?", RuntimeWarning) + + if com.is_integer_dtype(values) and (codes == -1).all(): + warn("None of the categories were found in values. Did you mean to use\n" + "'Categorical.from_codes(codes, categories)'?", RuntimeWarning) + + # if we got categories, we can assume that the order is intended + # if ordered is unspecified + if ordered is None: + ordered = True self.ordered = False if ordered is None else ordered - self._codes = codes - self.levels = levels + self.categories = categories self.name = name + self._codes = _coerce_codes_dtype(codes, categories) def copy(self): """ Copy constructor. 
""" - return Categorical(values=self._codes.copy(),levels=self.levels, + return Categorical(values=self._codes.copy(),categories=self.categories, name=self.name, ordered=self.ordered, fastpath=True) @classmethod @@ -268,20 +300,22 @@ def from_array(cls, data): """ Make a Categorical type from a single array-like object. + For internal compatibility with numpy arrays. + Parameters ---------- data : array-like - Can be an Index or array-like. The levels are assumed to be + Can be an Index or array-like. The categories are assumed to be the unique values of `data`. """ return Categorical(data) @classmethod - def from_codes(cls, codes, levels, ordered=True, name=None): + def from_codes(cls, codes, categories, ordered=False, name=None): """ - Make a Categorical type from codes and levels arrays. + Make a Categorical type from codes and categories arrays. - This constructor is useful if you already have codes and levels and so do not need the + This constructor is useful if you already have codes and categories and so do not need the (computation intensive) factorization step, which is usually done on the constructor. If your data does not follow this convention, please use the normal constructor. @@ -289,12 +323,12 @@ def from_codes(cls, codes, levels, ordered=True, name=None): Parameters ---------- codes : array-like, integers - An integer array, where each integer points to a level in levels or -1 for NaN - levels : index-like - The levels for the categorical. Items need to be unique. + An integer array, where each integer points to a category in categories or -1 for NaN + categories : index-like + The categories for the categorical. Items need to be unique. ordered : boolean, optional Whether or not this categorical is treated as a ordered categorical. If not given, - the resulting categorical will be ordered. + the resulting categorical will be unordered. name : str, optional Name for the Categorical variable. """ @@ -303,18 +337,18 @@ def from_codes(cls, codes, levels, ordered=True, name=None): except: raise ValueError("codes need to be convertible to an arrays of integers") - levels = cls._validate_levels(levels) + categories = cls._validate_categories(categories) - if codes.max() >= len(levels) or codes.min() < -1: - raise ValueError("codes need to be between -1 and len(levels)-1") + if codes.max() >= len(categories) or codes.min() < -1: + raise ValueError("codes need to be between -1 and len(categories)-1") - return Categorical(codes, levels=levels, ordered=ordered, name=name, fastpath=True) + return Categorical(codes, categories=categories, ordered=ordered, name=name, fastpath=True) _codes = None def _get_codes(self): - """ Get the level codes. + """ Get the codes. Returns ------- @@ -333,73 +367,322 @@ def _set_codes(self, codes): codes = property(fget=_get_codes, fset=_set_codes, doc=_codes_doc) - _levels = None + def _get_labels(self): + """ Get the category labels (deprecated). + + Deprecated, use .codes! + """ + import warnings + warnings.warn("'labels' is deprecated. 
Use 'codes' instead", FutureWarning) + return self.codes + + labels = property(fget=_get_labels, fset=_set_codes) + + _categories = None @classmethod - def _validate_levels(cls, levels): - """" Validates that we have good levels """ - levels = _ensure_index(levels) - if not levels.is_unique: - raise ValueError('Categorical levels must be unique') - return levels + def _validate_categories(cls, categories): + """" Validates that we have good categories """ + if not isinstance(categories, Index): + dtype = None + if not hasattr(categories, "dtype"): + categories = _convert_to_list_like(categories) + # on categories with NaNs, int values would be converted to float. + # Use "object" dtype to prevent this. + if isnull(categories).any(): + without_na = np.array([x for x in categories if com.notnull(x)]) + with_na = np.array(categories) + if with_na.dtype != without_na.dtype: + dtype = "object" + categories = Index(categories, dtype=dtype) + if not categories.is_unique: + raise ValueError('Categorical categories must be unique') + return categories + + def _set_categories(self, categories): + """ Sets new categories """ + categories = self._validate_categories(categories) + if not self._categories is None and len(categories) != len(self._categories): + raise ValueError("new categories need to have the same number of items than the old " + "categories!") + self._categories = categories + + def _get_categories(self): + """ Gets the categories """ + # categories is an Index, which is immutable -> no need to copy + return self._categories + + categories = property(fget=_get_categories, fset=_set_categories, doc=_categories_doc) def _set_levels(self, levels): - """ Sets new levels """ - levels = self._validate_levels(levels) - - if not self._levels is None and len(levels) < len(self._levels): - # remove all _codes which are larger - self._codes[self._codes >= len(levels)] = -1 - self._levels = levels + """ set new levels (deprecated, use "categories") """ + warn("Assigning to 'levels' is deprecated, use 'categories'", FutureWarning) + self.categories = levels def _get_levels(self): - """ Gets the levels """ - # levels is an Index, which is immutable -> no need to copy - return self._levels + """ Gets the levels (deprecated, use "categories") """ + warn("Accessing 'levels' is deprecated, use 'categories'", FutureWarning) + return self.categories + + # TODO: Remove after deprecation period in 2017/ after 0.18 + levels = property(fget=_get_levels, fset=_set_levels) - levels = property(fget=_get_levels, fset=_set_levels, doc=_levels_doc) + def set_categories(self, new_categories, ordered=None, rename=False, inplace=False): + """ Sets the categories to the specified new_categories. - def reorder_levels(self, new_levels, ordered=None): - """ Reorders levels as specified in new_levels. + `new_categories` can include new categories (which will result in unused categories) or + or remove old categories (which results in values set to NaN). If `rename==True`, + the categories will simple be renamed (less or more items than in old categories will + result in values set to NaN or in unused categories respectively). - `new_levels` must include all old levels but can also include new level items. In - contrast to assigning to `levels`, these new level items can be in arbitrary positions. + This method can be used to perform more than one action of adding, removing, + and reordering simultaneously and is therefore faster than performing the individual steps + via the more specialised methods. 
- The level reordering is done inplace. + On the other hand this methods does not do checks (e.g., whether the old categories are + included in the new categories on a reorder), which can result in surprising changes, for + example when using special string dtypes on python3, which does not considers a S1 string + equal to a single char python string. Raises ------ ValueError - If the new levels do not contain all old level items + If new_categories does not validate as categories Parameters ---------- - new_levels : Index-like - The levels in new order. must be of same length as the old levels + new_categories : Index-like + The categories in new order. ordered : boolean, optional Whether or not the categorical is treated as a ordered categorical. If not given, do not change the ordered information. + rename : boolean (default: False) + Whether or not the new_categories should be considered as a rename of the old + categories or as reordered categories. + inplace : boolean (default: False) + Whether or not to reorder the categories inplace or return a copy of this categorical + with reordered categories. + + Returns + ------- + cat : Categorical with reordered categories or None if inplace. + + See also + -------- + rename_categories + reorder_categories + add_categories + remove_categories + remove_unused_categories """ - new_levels = self._validate_levels(new_levels) + new_categories = self._validate_categories(new_categories) + cat = self if inplace else self.copy() + if rename: + if not cat._categories is None and len(new_categories) < len(cat._categories): + # remove all _codes which are larger and set to -1/NaN + self._codes[self._codes >= len(new_categories)] = -1 + cat._categories = new_categories + else: + values = cat.__array__() + cat._codes = _get_codes_for_values(values, new_categories) + cat._categories = new_categories - if len(new_levels) < len(self._levels) or len(self._levels-new_levels): - raise ValueError('Reordered levels must include all original levels') - values = self.__array__() - self._codes = _get_codes_for_values(values, new_levels) - self._levels = new_levels if not ordered is None: - self.ordered = ordered + cat.ordered = ordered + + if not inplace: + return cat + + def rename_categories(self, new_categories, inplace=False): + """ Renames categories. + + The new categories has to be a list-like object. All items must be unique and the number of + items in the new categories must be the same as the number of items in the old categories. + + Raises + ------ + ValueError + If the new categories do not have the same number of items than the current categories + or do not validate as categories + + Parameters + ---------- + new_categories : Index-like + The renamed categories. + inplace : boolean (default: False) + Whether or not to rename the categories inplace or return a copy of this categorical + with renamed categories. + + Returns + ------- + cat : Categorical with renamed categories added or None if inplace. + + See also + -------- + reorder_categories + add_categories + remove_categories + remove_unused_categories + set_categories + """ + cat = self if inplace else self.copy() + cat.categories = new_categories + if not inplace: + return cat + + def reorder_categories(self, new_categories, ordered=None, inplace=False): + """ Reorders categories as specified in new_categories. - def remove_unused_levels(self): - """ Removes levels which are not used. + `new_categories` need to include all old categories and no new category items. 
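A short, hedged sketch of the rename/set distinction documented above (the example data is made up):

    from pandas import Categorical

    cat = Categorical(['a', 'b', 'a', 'c'])
    cat.rename_categories(['x', 'y', 'z'])    # pure rename, same number of items
    cat.set_categories(['c', 'b', 'a', 'd'])  # reorder and add an unused category
    cat.set_categories(['a', 'b'])            # 'c' is no longer a category -> NaN
    # cat.rename_categories(['x', 'y'])       # would raise ValueError (wrong length)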
- The level removal is done inplace. + Raises + ------ + ValueError + If the new categories do not contain all old category items or any new ones + + Parameters + ---------- + new_categories : Index-like + The categories in new order. + ordered : boolean, optional + Whether or not the categorical is treated as a ordered categorical. If not given, + do not change the ordered information. + inplace : boolean (default: False) + Whether or not to reorder the categories inplace or return a copy of this categorical + with reordered categories. + + Returns + ------- + cat : Categorical with reordered categories or None if inplace. + + See also + -------- + rename_categories + add_categories + remove_categories + remove_unused_categories + set_categories """ - _used = sorted(np.unique(self._codes)) - new_levels = self.levels.take(_used) - new_levels = _ensure_index(new_levels) - self._codes = _get_codes_for_values(self.__array__(), new_levels) - self._levels = new_levels + if set(self._categories) != set(new_categories): + raise ValueError("items in new_categories are not the same as in old categories") + return self.set_categories(new_categories, ordered=ordered, inplace=inplace) + + def add_categories(self, new_categories, inplace=False): + """ Add new categories. + + `new_categories` will be included at the last/highest place in the categories and will be + unused directly after this call. + + Raises + ------ + ValueError + If the new categories include old categories or do not validate as categories + + Parameters + ---------- + new_categories : category or list-like of category + The new categories to be included. + inplace : boolean (default: False) + Whether or not to add the categories inplace or return a copy of this categorical + with added categories. + + Returns + ------- + cat : Categorical with new categories added or None if inplace. + + See also + -------- + rename_categories + reorder_categories + remove_categories + remove_unused_categories + set_categories + """ + if not com.is_list_like(new_categories): + new_categories = [new_categories] + already_included = set(new_categories) & set(self._categories) + if len(already_included) != 0: + msg = "new categories must not include old categories: %s" % str(already_included) + raise ValueError(msg) + new_categories = list(self._categories) + (new_categories) + new_categories = self._validate_categories(new_categories) + cat = self if inplace else self.copy() + cat._categories = new_categories + cat._codes = _coerce_codes_dtype(cat._codes, new_categories) + if not inplace: + return cat + + def remove_categories(self, removals, inplace=False): + """ Removes the specified categories. + + `removals` must be included in the old categories. Values which were in the removed + categories will be set to NaN + + Raises + ------ + ValueError + If the removals are not contained in the categories + + Parameters + ---------- + removals : category or list of categories + The categories which should be removed. + inplace : boolean (default: False) + Whether or not to remove the categories inplace or return a copy of this categorical + with removed categories. + + Returns + ------- + cat : Categorical with removed categories or None if inplace. 
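The more specialised helpers compose as follows (a minimal sketch with illustrative data; remove_unused_categories is defined just below):

    from pandas import Categorical

    cat = Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c'])
    cat = cat.reorder_categories(['c', 'b', 'a'], ordered=True)  # same set, new order
    cat = cat.add_categories(['d'])        # appended at the end, initially unused
    cat = cat.remove_categories(['c'])     # values in 'c' (none here) would become NaN
    cat = cat.remove_unused_categories()   # drops 'd' and any other unused category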
+ + See also + -------- + rename_categories + reorder_categories + add_categories + remove_unused_categories + set_categories + """ + if not com.is_list_like(removals): + removals = [removals] + removals = set(list(removals)) + not_included = removals - set(self._categories) + if len(not_included) != 0: + raise ValueError("removals must all be in old categories: %s" % str(not_included)) + new_categories = [ c for c in self._categories if c not in removals ] + return self.set_categories(new_categories, ordered=self.ordered, rename=False, + inplace=inplace) + + + def remove_unused_categories(self, inplace=False): + """ Removes categories which are not used. + + Parameters + ---------- + inplace : boolean (default: False) + Whether or not to drop unused categories inplace or return a copy of this categorical + with unused categories dropped. + + Returns + ------- + cat : Categorical with unused categories dropped or None if inplace. + + See also + -------- + rename_categories + reorder_categories + add_categories + remove_categories + set_categories + """ + cat = self if inplace else self.copy() + _used = sorted(np.unique(cat._codes)) + new_categories = cat.categories.take(com._ensure_platform_int(_used)) + new_categories = _ensure_index(new_categories) + cat._codes = _get_codes_for_values(cat.__array__(), new_categories) + cat._categories = new_categories + if not inplace: + return cat __eq__ = _cat_compare_op('__eq__') @@ -429,14 +712,83 @@ def __array__(self, dtype=None): Returns ------- values : numpy array - A numpy array of the same dtype as categorical.levels.dtype + A numpy array of either the specified dtype or, if dtype==None (default), the same + dtype as categorical.categories.dtype """ - return com.take_1d(self.levels.values, self._codes) + ret = com.take_1d(self.categories.values, self._codes) + if dtype and dtype != self.categories.dtype: + return np.asarray(ret, dtype) + return ret + + def __setstate__(self, state): + """Necessary for making this object picklable""" + if not isinstance(state, dict): + raise Exception('invalid pickle state') + + # Provide compatibility with pre-0.15.0 Categoricals. + if '_codes' not in state and 'labels' in state: + state['_codes'] = state.pop('labels') + if '_categories' not in state and '_levels' in state: + state['_categories'] = \ + self._validate_categories(state.pop('_levels')) + + for k, v in compat.iteritems(state): + setattr(self, k, v) @property def T(self): return self + @property + def nbytes(self): + return self._codes.nbytes + self._categories.values.nbytes + + def searchsorted(self, v, side='left', sorter=None): + raise NotImplementedError("See https://github.com/pydata/pandas/issues/8420") + + def isnull(self): + """ + Detect missing values + + Both missing values (-1 in .codes) and NA as a category are detected. + + Returns + ------- + a boolean array of whether my values are null + + See also + -------- + pandas.isnull : pandas version + Categorical.notnull : boolean inverse of Categorical.isnull + """ + + ret = self._codes == -1 + + # String/object and float categories can hold np.nan + if self.categories.dtype.kind in ['S', 'O', 'f']: + if np.nan in self.categories: + nan_pos = np.where(isnull(self.categories))[0] + # we only have one NA in categories + ret = np.logical_or(ret , self._codes == nan_pos) + return ret + + def notnull(self): + """ + Reverse of isnull + + Both missing values (-1 in .codes) and NA as a category are detected as null. 
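Missing-value detection on the codes-based layout, as a rough example (assumes the default representation of NaN as code -1):

    import numpy as np
    from pandas import Categorical

    cat = Categorical(['a', 'b', np.nan, 'a'])
    cat.isnull()    # array([False, False, True, False])
    cat.notnull()   # element-wise inverse of isnull()
    cat.nbytes      # bytes of the codes array plus bytes of the categories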
+ + Returns + ------- + a boolean array of whether my values are not null + + See also + -------- + pandas.notnull : pandas version + Categorical.isnull : boolean inverse of Categorical.notnull + """ + return ~self.isnull() + def get_values(self): """ Return the values. @@ -445,12 +797,13 @@ def get_values(self): Returns ------- values : numpy array - A numpy array of the same dtype as categorical.levels.dtype or dtype string if periods + A numpy array of the same dtype as categorical.categories.dtype or dtype string if + periods """ # if we are a period index, return a string repr - if isinstance(self.levels, PeriodIndex): - return com.take_1d(np.array(self.levels.to_native_types(), dtype=object), + if isinstance(self.categories, PeriodIndex): + return com.take_1d(np.array(self.categories.to_native_types(), dtype=object), self._codes) return np.array(self) @@ -474,7 +827,7 @@ def argsort(self, ascending=True, **kwargs): return result def order(self, inplace=False, ascending=True, na_position='last', **kwargs): - """ Sorts the Category by level value returning a new Categorical by default. + """ Sorts the Category by category value returning a new Categorical by default. Only ordered Categoricals can be sorted! @@ -503,20 +856,37 @@ def order(self, inplace=False, ascending=True, na_position='last', **kwargs): if na_position not in ['last','first']: raise ValueError('invalid na_position: {!r}'.format(na_position)) - codes = np.sort(self._codes.copy()) + codes = np.sort(self._codes) if not ascending: codes = codes[::-1] + # NaN handling + na_mask = (codes==-1) + if na_mask.any(): + n_nans = len(codes[na_mask]) + if na_position=="first" and not ascending: + # in this case sort to the front + new_codes = codes.copy() + new_codes[0:n_nans] = -1 + new_codes[n_nans:] = codes[~na_mask] + codes = new_codes + elif na_position=="last" and not ascending: + # ... and to the end + new_codes = codes.copy() + pos = len(codes)-n_nans + new_codes[0:pos] = codes[~na_mask] + new_codes[pos:] = -1 + codes = new_codes if inplace: self._codes = codes return else: - return Categorical(values=codes,levels=self.levels, ordered=self.ordered, + return Categorical(values=codes,categories=self.categories, ordered=self.ordered, name=self.name, fastpath=True) def sort(self, inplace=True, ascending=True, na_position='last', **kwargs): - """ Sorts the Category inplace by level value. + """ Sorts the Category inplace by category value. Only ordered Categoricals can be sorted! @@ -566,7 +936,14 @@ def view(self): return self def to_dense(self): - """ Return my 'dense' repr """ + """Return my 'dense' representation + + For internal compatibility with numpy arrays. 
+ + Returns + ------- + dense : array + """ return np.asarray(self) def fillna(self, fill_value=None, method=None, limit=None, **kwargs): @@ -595,41 +972,58 @@ def fillna(self, fill_value=None, method=None, limit=None, **kwargs): values = self._codes + # Make sure that we also get NA in categories + if self.categories.dtype.kind in ['S', 'O', 'f']: + if np.nan in self.categories: + values = values.copy() + nan_pos = np.where(isnull(self.categories))[0] + # we only have one NA in categories + values[values == nan_pos] = -1 + + # pad / bfill if method is not None: values = self.to_dense().reshape(-1,len(self)) values = com.interpolate_2d( - values, method, 0, None, fill_value).astype(self.levels.dtype)[0] - values = _get_codes_for_values(values, self.levels) + values, method, 0, None, fill_value).astype(self.categories.dtype)[0] + values = _get_codes_for_values(values, self.categories) else: - if not com.isnull(fill_value) and fill_value not in self.levels: - raise ValueError("fill value must be in levels") + if not com.isnull(fill_value) and fill_value not in self.categories: + raise ValueError("fill value must be in categories") - mask = self._codes==-1 + mask = values==-1 if mask.any(): - values = self._codes.copy() - values[mask] = self.levels.get_loc(fill_value) + values = values.copy() + values[mask] = self.categories.get_loc(fill_value) - return Categorical(values, levels=self.levels, ordered=self.ordered, + return Categorical(values, categories=self.categories, ordered=self.ordered, name=self.name, fastpath=True) def take_nd(self, indexer, allow_fill=True, fill_value=None): - """ Take the values by the indexer, fill with the fill_value. """ - if allow_fill and fill_value is None: - fill_value = np.nan + """ Take the codes by the indexer, fill with the fill_value. + + For internal compatibility with numpy arrays. + """ + + # filling must always be None/nan here + # but is passed thru internally + assert isnull(fill_value) - values = com.take_1d(self._codes, indexer, allow_fill=allow_fill, fill_value=fill_value) - result = Categorical(values=values, levels=self.levels, ordered=self.ordered, + codes = com.take_1d(self._codes, indexer, allow_fill=True, fill_value=-1) + result = Categorical(codes, categories=self.categories, ordered=self.ordered, name=self.name, fastpath=True) return result take = take_nd def _slice(self, slicer): - """ Return a slice of myself. """ + """ Return a slice of myself. + + For internal compatibility with numpy arrays. + """ # only allow 1 dimensional slicing, but can # in a 2-d case be passd (slice(None),....) 
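The fillna contract above ("fill value must be in categories") in practice, as a minimal sketch:

    import numpy as np
    from pandas import Categorical

    cat = Categorical(['a', 'b', np.nan], categories=['a', 'b', 'c'])
    cat.fillna('c')     # NaN replaced by the existing category 'c'
    # cat.fillna('d')   # would raise ValueError: fill value must be in categories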
@@ -639,42 +1033,44 @@ def _slice(self, slicer): slicer = slicer[1] _codes = self._codes[slicer] - return Categorical(values=_codes,levels=self.levels, ordered=self.ordered, + return Categorical(values=_codes,categories=self.categories, ordered=self.ordered, name=self.name, fastpath=True) def __len__(self): + """The length of this Categorical.""" return len(self._codes) def __iter__(self): + """Returns an Iterator over the values of this Categorical.""" return iter(np.array(self)) - def _tidy_repr(self, max_vals=20): + def _tidy_repr(self, max_vals=10): num = max_vals // 2 head = self[:num]._get_repr(length=False, name=False, footer=False) tail = self[-(max_vals - num):]._get_repr(length=False, name=False, footer=False) - result = '%s\n...\n%s' % (head, tail) + result = '%s, ..., %s' % (head[:-1], tail[1:]) result = '%s\n%s' % (result, self._repr_footer()) return compat.text_type(result) - def _repr_level_info(self): + def _repr_categories_info(self): """ Returns a string representation of the footer.""" - max_levels = (10 if get_option("display.max_levels") == 0 - else get_option("display.max_levels")) - level_strs = fmt.format_array(self.levels.get_values(), None) - if len(level_strs) > max_levels: - num = max_levels // 2 - head = level_strs[:num] - tail = level_strs[-(max_levels - num):] - level_strs = head + ["..."] + tail + max_categories = (10 if get_option("display.max_categories") == 0 + else get_option("display.max_categories")) + category_strs = fmt.format_array(self.categories.get_values(), None) + if len(category_strs) > max_categories: + num = max_categories // 2 + head = category_strs[:num] + tail = category_strs[-(max_categories - num):] + category_strs = head + ["..."] + tail # Strip all leading spaces, which format_array adds for columns... - level_strs = [x.strip() for x in level_strs] - levheader = "Levels (%d, %s): " % (len(self.levels), - self.levels.dtype) + category_strs = [x.strip() for x in category_strs] + levheader = "Categories (%d, %s): " % (len(self.categories), + self.categories.dtype) width, height = get_terminal_size() max_width = (width if get_option("display.width") == 0 else get_option("display.width")) @@ -685,7 +1081,7 @@ def _repr_level_info(self): start = True cur_col_len = len(levheader) sep_len, sep = (3, " < ") if self.ordered else (2, ", ") - for val in level_strs: + for val in category_strs: if max_width != 0 and cur_col_len + sep_len + len(val) > max_width: levstring += "\n" + (" "* len(levheader)) cur_col_len = len(levheader) @@ -701,7 +1097,7 @@ def _repr_footer(self): namestr = "Name: %s, " % self.name if self.name is not None else "" return u('%sLength: %d\n%s') % (namestr, - len(self), self._repr_level_info()) + len(self), self._repr_categories_info()) def _get_repr(self, name=False, length=True, na_rep='NaN', footer=True): formatter = fmt.CategoricalFormatter(self, name=name, @@ -712,23 +1108,26 @@ def _get_repr(self, name=False, length=True, na_rep='NaN', footer=True): def __unicode__(self): """ Unicode representation. 
""" - width, height = get_terminal_size() - max_rows = (height if get_option("display.max_rows") == 0 - else get_option("display.max_rows")) - - if len(self._codes) > (max_rows or 1000): - result = self._tidy_repr(min(30, max_rows) - 4) + _maxlen = 10 + if len(self._codes) > _maxlen: + result = self._tidy_repr(_maxlen) elif len(self._codes) > 0: - result = self._get_repr(length=len(self) > 50, + result = self._get_repr(length=len(self) > _maxlen, name=True) else: - result = 'Categorical([], %s' % self._get_repr(name=True, + result = '[], %s' % self._get_repr(name=True, length=False, footer=True, ).replace("\n",", ") return result + def _maybe_coerce_indexer(self, indexer): + """ return an indexer coerced to the codes dtype """ + if isinstance(indexer, np.ndarray) and indexer.dtype.kind == 'i': + indexer = indexer.astype(self._codes.dtype) + return indexer + def __getitem__(self, key): """ Return an item. """ if isinstance(key, (int, np.integer)): @@ -736,9 +1135,10 @@ def __getitem__(self, key): if i == -1: return np.nan else: - return self.levels[i] + return self.categories[i] else: - return Categorical(values=self._codes[key], levels=self.levels, + key = self._maybe_coerce_indexer(key) + return Categorical(values=self._codes[key], categories=self.categories, ordered=self.ordered, fastpath=True) def __setitem__(self, key, value): @@ -748,29 +1148,30 @@ def __setitem__(self, key, value): Raises ------ ValueError - If (one or more) Value is not in levels or if a assigned `Categorical` has not the - same levels + If (one or more) Value is not in categories or if a assigned `Categorical` has not the + same categories """ - # require identical level set + # require identical categories set if isinstance(value, Categorical): - if not value.levels.equals(self.levels): - raise ValueError("cannot set a Categorical with another, without identical levels") + if not value.categories.equals(self.categories): + raise ValueError("Cannot set a Categorical with another, without identical " + "categories") rvalue = value if com.is_list_like(value) else [value] - to_add = Index(rvalue)-self.levels - if len(to_add): - raise ValueError("cannot setitem on a Categorical with a new level," - " set the levels first") + to_add = Index(rvalue).difference(self.categories) + # no assignments of values not in categories, but it's always ok to set something to np.nan + if len(to_add) and not isnull(to_add).all(): + raise ValueError("cannot setitem on a Categorical with a new category," + " set the categories first") # set by position if isinstance(key, (int, np.integer)): pass - # tuple of indexers + # tuple of indexers (dataframe) elif isinstance(key, tuple): - # only allow 1 dimensional slicing, but can # in a 2-d case be passd (slice(None),....) if len(key) == 2: @@ -782,10 +1183,30 @@ def __setitem__(self, key, value): else: raise AssertionError("invalid slicing for a 1-ndim categorical") - else: - key = self._codes[key] + # slicing in Series or Categorical + elif isinstance(key, slice): + pass - lindexer = self.levels.get_indexer(rvalue) + # Array of True/False in Series or Categorical + else: + # There is a bug in numpy, which does not accept a Series as a indexer + # https://github.com/pydata/pandas/issues/6168 + # https://github.com/numpy/numpy/issues/4240 -> fixed in numpy 1.9 + # FIXME: remove when numpy 1.9 is the lowest numpy version pandas accepts... 
+ key = np.asarray(key) + + lindexer = self.categories.get_indexer(rvalue) + + # FIXME: the following can be removed after https://github.com/pydata/pandas/issues/7820 + # is fixed. + # float categories do currently return -1 for np.nan, even if np.nan is included in the + # index -> "repair" this here + if isnull(rvalue).any() and isnull(self.categories).any(): + nan_pos = np.where(com.isnull(self.categories))[0] + lindexer[lindexer == -1] = nan_pos + + key = self._maybe_coerce_indexer(key) + lindexer = self._maybe_coerce_indexer(lindexer) self._codes[key] = lindexer #### reduction ops #### @@ -821,7 +1242,7 @@ def min(self, numeric_only=None, **kwargs): if pointer == -1: return np.nan else: - return self.levels[pointer] + return self.categories[pointer] def max(self, numeric_only=None, **kwargs): @@ -848,7 +1269,7 @@ def max(self, numeric_only=None, **kwargs): if pointer == -1: return np.nan else: - return self.levels[pointer] + return self.categories[pointer] def mode(self): """ @@ -865,7 +1286,7 @@ def mode(self): import pandas.hashtable as htable good = self._codes != -1 result = Categorical(sorted(htable.mode_int64(com._ensure_int64(self._codes[good]))), - levels=self.levels,ordered=self.ordered, name=self.name, + categories=self.categories,ordered=self.ordered, name=self.name, fastpath=True) return result @@ -873,13 +1294,13 @@ def unique(self): """ Return the unique values. - This includes all levels, even if one or more is unused. + This includes all categories, even if one or more is unused. Returns ------- unique values : array """ - return self.levels + return np.asarray(self.categories) def equals(self, other): """ @@ -898,7 +1319,7 @@ def equals(self, other): if not isinstance(other, Categorical): return False # TODO: should this also test if name is equal? - return (self.levels.equals(other.levels) and self.ordered == other.ordered and + return (self.categories.equals(other.categories) and self.ordered == other.ordered and np.array_equal(self._codes, other._codes)) def describe(self): @@ -907,7 +1328,7 @@ def describe(self): Returns ------- description: `DataFrame` - A dataframe with frequency and counts by level. + A dataframe with frequency and counts by category. """ # Hack? from pandas.core.frame import DataFrame @@ -916,29 +1337,131 @@ def describe(self): 'values' : self._codes } ).groupby('codes').count() - counts.index = self.levels.take(counts.index) - counts = counts.reindex(self.levels) freqs = counts / float(counts.sum()) from pandas.tools.merge import concat result = concat([counts,freqs],axis=1) - result.index.name = 'levels' result.columns = ['counts','freqs'] + + # fill in the real categories + check = result.index == -1 + if check.any(): + # Sort -1 (=NaN) to the last position + index = np.arange(0, len(self.categories)+1, dtype='int64') + index[-1] = -1 + result = result.reindex(index) + # build new index + categories = np.arange(0,len(self.categories)+1 ,dtype=object) + categories[:-1] = self.categories + categories[-1] = np.nan + result.index = categories.take(com._ensure_platform_int(result.index)) + else: + result.index = self.categories.take(com._ensure_platform_int(result.index)) + result = result.reindex(self.categories) + result.index.name = 'categories' + return result +##### The Series.cat accessor ##### + +class CategoricalAccessor(PandasDelegate): + """ + Accessor object for categorical properties of the Series values. 
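Reductions and summaries under the renamed API, as an illustrative sketch (ordered=True is required for min/max):

    from pandas import Categorical

    cat = Categorical(['a', 'c', 'a', 'b'], categories=['a', 'b', 'c'], ordered=True)
    cat.min(), cat.max()   # ('a', 'c')
    cat.unique()           # ndarray of all categories, including unused ones
    cat.describe()         # DataFrame of counts and freqs, indexed by category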
+ + Be aware that assigning to `categories` is a inplace operation, while all methods return + new categorical data per default (but can be called with `inplace=True`). + + Examples + -------- + >>> s.cat.categories + >>> s.cat.categories = list('abc') + >>> s.cat.rename_categories(list('cab')) + >>> s.cat.reorder_categories(list('cab')) + >>> s.cat.add_categories(['d','e']) + >>> s.cat.remove_categories(['d']) + >>> s.cat.remove_unused_categories() + >>> s.cat.set_categories(list('abcde')) + + """ + + def __init__(self, values, index): + self.categorical = values + self.index = index + + def _delegate_property_get(self, name): + return getattr(self.categorical, name) + + def _delegate_property_set(self, name, new_values): + return setattr(self.categorical, name, new_values) + + @property + def codes(self): + from pandas import Series + return Series(self.categorical.codes, index=self.index) + + def _delegate_method(self, name, *args, **kwargs): + from pandas import Series + method = getattr(self.categorical, name) + res = method(*args, **kwargs) + if not res is None: + return Series(res, index=self.index) + +# TODO: remove levels after the deprecation period +CategoricalAccessor._add_delegate_accessors(delegate=Categorical, + accessors=["categories", "ordered"], + typ='property') +CategoricalAccessor._add_delegate_accessors(delegate=Categorical, + accessors=["rename_categories", + "reorder_categories", + "add_categories", + "remove_categories", + "remove_unused_categories", + "set_categories"], + typ='method') + ##### utility routines ##### -def _get_codes_for_values(values, levels): +_int8_max = np.iinfo(np.int8).max +_int16_max = np.iinfo(np.int16).max +_int32_max = np.iinfo(np.int32).max + +def _coerce_codes_dtype(codes, categories): + """ coerce the code input array to an appropriate dtype """ + codes = np.array(codes,copy=False) + l = len(categories) + if l < _int8_max: + return codes.astype('int8') + elif l < _int16_max: + return codes.astype('int16') + elif l < _int32_max: + return codes.astype('int32') + return codes.astype('int64') + +def _get_codes_for_values(values, categories): """" - utility routine to turn values into codes given the specified levels + utility routine to turn values into codes given the specified categories """ from pandas.core.algorithms import _get_data_algo, _hashtables - if values.dtype != levels.dtype: + if values.dtype != categories.dtype: values = com._ensure_object(values) - levels = com._ensure_object(levels) + categories = com._ensure_object(categories) (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables) - t = hash_klass(len(levels)) - t.map_locations(com._values_from_object(levels)) - return com._ensure_platform_int(t.lookup(values)) + t = hash_klass(len(categories)) + t.map_locations(com._values_from_object(categories)) + return _coerce_codes_dtype(t.lookup(values), categories) + +def _convert_to_list_like(list_like): + if hasattr(list_like, "dtype"): + return list_like + if isinstance(list_like, list): + return list_like + if (com._is_sequence(list_like) or isinstance(list_like, tuple) + or isinstance(list_like, types.GeneratorType)): + return list(list_like) + elif np.isscalar(list_like): + return [list_like] + else: + # is this reached? 
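Typical use of the Series.cat accessor wired up above (a sketch; the data is made up):

    import pandas as pd

    s = pd.Series(['a', 'b', 'a'], dtype='category')
    s.cat.categories                    # Index(['a', 'b'])
    s.cat.codes                         # Series of integer codes, aligned to s.index
    s = s.cat.add_categories(['c'])     # accessor methods return a new Series by default
    s.cat.categories = ['x', 'y', 'z']  # property assignment renames inplace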
+ return [list_like] diff --git a/pandas/core/common.py b/pandas/core/common.py index 48fb75f59ac34..31dc58d1870e0 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -22,7 +22,6 @@ from pandas.compat import StringIO, BytesIO, range, long, u, zip, map from pandas.core.config import get_option -from pandas.core import array as pa class PandasError(Exception): pass @@ -64,6 +63,13 @@ def _check(cls, inst): return meta(name, tuple(), dct) +ABCIndex = create_pandas_abc_type("ABCIndex", "_typ", ("index",)) +ABCInt64Index = create_pandas_abc_type("ABCInt64Index", "_typ", ("int64index",)) +ABCFloat64Index = create_pandas_abc_type("ABCFloat64Index", "_typ", ("float64index",)) +ABCMultiIndex = create_pandas_abc_type("ABCMultiIndex", "_typ", ("multiindex",)) +ABCDatetimeIndex = create_pandas_abc_type("ABCDatetimeIndex", "_typ", ("datetimeindex",)) +ABCTimedeltaIndex = create_pandas_abc_type("ABCTimedeltaIndex", "_typ", ("timedeltaindex",)) +ABCPeriodIndex = create_pandas_abc_type("ABCPeriodIndex", "_typ", ("periodindex",)) ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series",)) ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe",)) ABCPanel = create_pandas_abc_type("ABCPanel", "_typ", ("panel",)) @@ -276,15 +282,22 @@ def _isnull_ndarraylike(obj): dtype = values.dtype if dtype.kind in ('O', 'S', 'U'): - # Working around NumPy ticket 1542 - shape = values.shape - - if dtype.kind in ('S', 'U'): - result = np.zeros(values.shape, dtype=bool) + if is_categorical_dtype(values): + from pandas import Categorical + if not isinstance(values, Categorical): + values = values.values + result = values.isnull() else: - result = np.empty(shape, dtype=bool) - vec = lib.isnullobj(values.ravel()) - result[...] = vec.reshape(shape) + + # Working around NumPy ticket 1542 + shape = values.shape + + if dtype.kind in ('S', 'U'): + result = np.zeros(values.shape, dtype=bool) + else: + result = np.empty(shape, dtype=bool) + vec = lib.isnullobj(values.ravel()) + result[...] = vec.reshape(shape) elif dtype in _DATELIKE_DTYPES: # this is the NaT pattern @@ -299,7 +312,6 @@ def _isnull_ndarraylike(obj): return result - def _isnull_ndarraylike_old(obj): values = getattr(obj, 'values', obj) dtype = values.dtype @@ -368,7 +380,7 @@ def _is_null_datelike_scalar(other): return isnull(other) return False -def array_equivalent(left, right): +def array_equivalent(left, right, strict_nan=False): """ True if two arrays, left and right, have equal non-NaN elements, and NaNs in corresponding locations. False otherwise. It is assumed that left and right @@ -379,6 +391,8 @@ def array_equivalent(left, right): Parameters ---------- left, right : ndarrays + strict_nan : bool, default False + If True, consider NaN and None to be different. Returns ------- @@ -392,13 +406,35 @@ def array_equivalent(left, right): >>> array_equivalent(np.array([1, nan, 2]), np.array([1, 2, nan])) False """ + left, right = np.asarray(left), np.asarray(right) if left.shape != right.shape: return False - # NaNs occur only in object arrays, float or complex arrays. - if issubclass(left.dtype.type, np.object_): - return ((left == right) | (pd.isnull(left) & pd.isnull(right))).all() + + # Object arrays can contain None, NaN and NaT. + if issubclass(left.dtype.type, np.object_) or issubclass(right.dtype.type, np.object_): + + if not strict_nan: + # pd.isnull considers NaN and None to be equivalent. 
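The new strict_nan flag changes how None and NaN compare; a small sketch against pandas.core.common (object-dtype arrays assumed):

    import numpy as np
    from pandas.core.common import array_equivalent

    left = np.array([1.0, np.nan, None], dtype=object)
    right = np.array([1.0, None, np.nan], dtype=object)
    array_equivalent(left, right)                   # True: NaN and None treated as equivalent
    array_equivalent(left, right, strict_nan=True)  # False: NaN and None now differ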
+ return lib.array_equivalent_object(_ensure_object(left.ravel()), + _ensure_object(right.ravel())) + + for left_value, right_value in zip(left, right): + if left_value is tslib.NaT and right_value is not tslib.NaT: + return False + + elif isinstance(left_value, float) and np.isnan(left_value): + if not isinstance(right_value, float) or not np.isnan(right_value): + return False + else: + if left_value != right_value: + return False + return True + + # NaNs can occur in float and complex arrays. if issubclass(left.dtype.type, (np.floating, np.complexfloating)): return ((left == right) | (np.isnan(left) & np.isnan(right))).all() + + # NaNs cannot occur otherwise. return np.array_equal(left, right) def _iterable_not_string(x): @@ -850,7 +886,6 @@ def func(arr, indexer, out, fill_value=np.nan): func(arr, indexer, out=out, fill_value=fill_value) return out - _diff_special = { 'float64': algos.diff_2d_float64, 'float32': algos.diff_2d_float32, @@ -860,24 +895,25 @@ def func(arr, indexer, out, fill_value=np.nan): 'int8': algos.diff_2d_int8, } - def diff(arr, n, axis=0): """ difference of n between self, analagoust to s-s.shift(n) """ n = int(n) - dtype = arr.dtype na = np.nan - - if is_timedelta64_dtype(arr) or is_datetime64_dtype(arr): - dtype = 'timedelta64[ns]' + dtype = arr.dtype + is_timedelta = False + if needs_i8_conversion(arr): + dtype = np.float64 arr = arr.view('i8') na = tslib.iNaT + is_timedelta = True elif issubclass(dtype.type, np.integer): dtype = np.float64 elif issubclass(dtype.type, np.bool_): dtype = np.object_ + dtype = np.dtype(dtype) out_arr = np.empty(arr.shape, dtype=dtype) na_indexer = [slice(None)] * arr.ndim @@ -898,7 +934,7 @@ def diff(arr, n, axis=0): # need to make sure that we account for na for datelike/timedelta # we don't actually want to subtract these i8 numbers - if dtype == 'timedelta64[ns]': + if is_timedelta: res = arr[res_indexer] lag = arr[lag_indexer] @@ -915,6 +951,10 @@ def diff(arr, n, axis=0): else: out_arr[res_indexer] = arr[res_indexer] - arr[lag_indexer] + if is_timedelta: + from pandas import TimedeltaIndex + out_arr = TimedeltaIndex(out_arr.ravel().astype('int64')).asi8.reshape(out_arr.shape).astype('timedelta64[ns]') + return out_arr @@ -959,7 +999,7 @@ def _infer_dtype_from_scalar(val): dtype = np.object_ # a 1-element ndarray - if isinstance(val, pa.Array): + if isinstance(val, np.ndarray): if val.ndim != 0: raise ValueError( "invalid ndarray passed to _infer_dtype_from_scalar") @@ -1310,7 +1350,7 @@ def _fill_zeros(result, x, y, name, fill): if not isinstance(y, np.ndarray): dtype, value = _infer_dtype_from_scalar(y) - y = pa.empty(result.shape, dtype=dtype) + y = np.empty(result.shape, dtype=dtype) y.fill(value) if is_integer_dtype(y): @@ -1505,6 +1545,8 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None, def _interp_limit(invalid, limit): """mask off values that won't be filled since they exceed the limit""" all_nans = np.where(invalid)[0] + if all_nans.size == 0: # no nans anyway + return [] violate = [invalid[x:x + limit + 1] for x in all_nans] violate = np.array([x.all() & (x.size > limit) for x in violate]) return all_nans[violate] + limit @@ -1533,7 +1575,7 @@ def _interp_limit(invalid, limit): inds = np.asarray(xvalues) # hack for DatetimeIndex, #1646 if issubclass(inds.dtype.type, np.datetime64): - inds = inds.view(pa.int64) + inds = inds.view(np.int64) if inds.dtype == np.object_: inds = lib.maybe_convert_objects(inds) @@ -1749,7 +1791,7 @@ def _maybe_box_datetimelike(value): if isinstance(value, np.datetime64): 
value = tslib.Timestamp(value) elif isinstance(value, np.timedelta64): - pass + value = tslib.Timedelta(value) return value @@ -1786,9 +1828,8 @@ def _possibly_convert_objects(values, convert_dates=True, if convert_timedeltas and values.dtype == np.object_: if convert_timedeltas == 'coerce': - from pandas.tseries.timedeltas import \ - _possibly_cast_to_timedelta - values = _possibly_cast_to_timedelta(values, coerce=True) + from pandas.tseries.timedeltas import to_timedelta + values = to_timedelta(values, coerce=True) # if we are all nans then leave me alone if not isnull(new_values).all(): @@ -1848,7 +1889,7 @@ def _possibly_cast_to_datetime(value, dtype, coerce=False): """ try to cast the array/value to a datetimelike dtype, converting float nan to iNaT """ - from pandas.tseries.timedeltas import _possibly_cast_to_timedelta + from pandas.tseries.timedeltas import to_timedelta from pandas.tseries.tools import to_datetime if dtype is not None: @@ -1890,8 +1931,7 @@ def _possibly_cast_to_datetime(value, dtype, coerce=False): if is_datetime64: value = to_datetime(value, coerce=coerce).values elif is_timedelta64: - value = _possibly_cast_to_timedelta(value, - dtype=dtype) + value = to_timedelta(value, coerce=coerce).values except (AttributeError, ValueError): pass @@ -1908,7 +1948,7 @@ def _possibly_cast_to_datetime(value, dtype, coerce=False): value = value.astype(_NS_DTYPE) elif dtype.kind == 'm' and dtype != _TD_DTYPE: - value = _possibly_cast_to_timedelta(value) + value = to_timedelta(value) # only do this if we have an array and the dtype of the array is not # setup already we are not an integer/object, so don't bother with this @@ -1920,15 +1960,24 @@ def _possibly_cast_to_datetime(value, dtype, coerce=False): return value -def _possibly_infer_to_datetimelike(value): - # we might have a array (or single object) that is datetime like, - # and no dtype is passed don't change the value unless we find a - # datetime/timedelta set +def _possibly_infer_to_datetimelike(value, convert_dates=False): + """ + we might have a array (or single object) that is datetime like, + and no dtype is passed don't change the value unless we find a + datetime/timedelta set + + this is pretty strict in that a datetime/timedelta is REQUIRED + in addition to possible nulls/string likes - # this is pretty strict in that a datetime/timedelta is REQUIRED - # in addition to possible nulls/string likes + ONLY strings are NOT datetimelike - # ONLY strings are NOT datetimelike + Parameters + ---------- + convert_dates : boolean, default False + if True try really hard to convert dates (such as datetime.date), other + leave inferred dtype 'date' alone + + """ v = value if not is_list_like(v): @@ -1955,22 +2004,13 @@ def _try_timedelta(v): try: return to_timedelta(v).values.reshape(shape) except: - - # this is for compat with numpy < 1.7 - # but string-likes will fail here - - from pandas.tseries.timedeltas import \ - _possibly_cast_to_timedelta - try: - return _possibly_cast_to_timedelta(v, coerce='compat').reshape(shape) - except: - return v + return v # do a quick inference for perf sample = v[:min(3,len(v))] inferred_type = lib.infer_dtype(sample) - if inferred_type in ['datetime', 'datetime64']: + if inferred_type in ['datetime', 'datetime64'] or (convert_dates and inferred_type in ['date']): value = _try_datetime(v) elif inferred_type in ['timedelta', 'timedelta64']: value = _try_timedelta(v) @@ -2070,11 +2110,6 @@ def _count_not_none(*args): # miscellaneous python tools -def rands(n): - """Generates a random 
alphanumeric string of length *n*""" - from random import Random - import string - return ''.join(Random().sample(string.ascii_letters + string.digits, n)) def adjoin(space, *lists): @@ -2296,20 +2331,47 @@ def is_iterator(obj): def is_number(obj): return isinstance(obj, (numbers.Number, np.number)) +def is_period_arraylike(arr): + """ return if we are period arraylike / PeriodIndex """ + if isinstance(arr, pd.PeriodIndex): + return True + elif isinstance(arr, (np.ndarray, ABCSeries)): + return arr.dtype == object and lib.infer_dtype(arr) == 'period' + return getattr(arr, 'inferred_type', None) == 'period' + +def is_datetime_arraylike(arr): + """ return if we are datetime arraylike / DatetimeIndex """ + if isinstance(arr, pd.DatetimeIndex): + return True + elif isinstance(arr, (np.ndarray, ABCSeries)): + return arr.dtype == object and lib.infer_dtype(arr) == 'datetime' + return getattr(arr, 'inferred_type', None) == 'datetime' + +def _coerce_to_dtype(dtype): + """ coerce a string / np.dtype to a dtype """ + if is_categorical_dtype(dtype): + dtype = CategoricalDtype() + else: + dtype = np.dtype(dtype) + return dtype def _get_dtype(arr_or_dtype): if isinstance(arr_or_dtype, np.dtype): return arr_or_dtype - if isinstance(arr_or_dtype, type): + elif isinstance(arr_or_dtype, type): return np.dtype(arr_or_dtype) + elif isinstance(arr_or_dtype, CategoricalDtype): + return CategoricalDtype() return arr_or_dtype.dtype def _get_dtype_type(arr_or_dtype): if isinstance(arr_or_dtype, np.dtype): return arr_or_dtype.type - if isinstance(arr_or_dtype, type): + elif isinstance(arr_or_dtype, type): return np.dtype(arr_or_dtype).type + elif isinstance(arr_or_dtype, CategoricalDtype): + return CategoricalDtypeType return arr_or_dtype.dtype.type @@ -2339,7 +2401,6 @@ def is_datetime64_ns_dtype(arr_or_dtype): tipo = _get_dtype(arr_or_dtype) return tipo == _NS_DTYPE - def is_timedelta64_dtype(arr_or_dtype): tipo = _get_dtype_type(arr_or_dtype) return issubclass(tipo, np.timedelta64) @@ -2357,6 +2418,13 @@ def _is_datetime_or_timedelta_dtype(arr_or_dtype): needs_i8_conversion = _is_datetime_or_timedelta_dtype +def i8_boxer(arr_or_dtype): + """ return the scalar boxer for the dtype """ + if is_datetime64_dtype(arr_or_dtype): + return lib.Timestamp + elif is_timedelta64_dtype(arr_or_dtype): + return lambda x: lib.Timedelta(x,unit='ns') + raise ValueError("cannot find a scalar boxer for {0}".format(arr_or_dtype)) def is_numeric_dtype(arr_or_dtype): tipo = _get_dtype_type(arr_or_dtype) @@ -2440,7 +2508,7 @@ def _get_callable_name(obj): # instead of the empty string in this case to allow # distinguishing between no name and a name of '' return None - + _string_dtypes = frozenset(map(_get_dtype_from_object, (compat.binary_type, compat.text_type))) @@ -2459,7 +2527,7 @@ def _astype_nansafe(arr, dtype, copy=True): """ return a view if copy is False, but need to be very careful as the result shape could change! 
""" if not isinstance(dtype, np.dtype): - dtype = np.dtype(dtype) + dtype = _coerce_to_dtype(dtype) if is_datetime64_dtype(arr): if dtype == object: @@ -2474,7 +2542,7 @@ def _astype_nansafe(arr, dtype, copy=True): if dtype == np.int64: return arr.view(dtype) elif dtype == object: - return arr.astype(object) + return tslib.ints_to_pytimedelta(arr.view(np.int64)) # in py3, timedelta64[ns] are int64 elif ((compat.PY3 and dtype not in [_INT64_DTYPE, _TD_DTYPE]) or @@ -2696,27 +2764,38 @@ def _concat_compat(to_concat, axis=0): # marginal given that it would still require shape & dtype calculation and # np.concatenate which has them both implemented is compiled. if nonempty: + is_datetime64 = [x.dtype == _NS_DTYPE for x in nonempty] + is_timedelta64 = [x.dtype == _TD_DTYPE for x in nonempty] + if all(is_datetime64): - # work around NumPy 1.6 bug new_values = np.concatenate([x.view(np.int64) for x in nonempty], axis=axis) return new_values.view(_NS_DTYPE) - elif any(is_datetime64): + elif all(is_timedelta64): + new_values = np.concatenate([x.view(np.int64) for x in nonempty], + axis=axis) + return new_values.view(_TD_DTYPE) + elif any(is_datetime64) or any(is_timedelta64): to_concat = [_to_pydatetime(x) for x in nonempty] return np.concatenate(to_concat, axis=axis) def _to_pydatetime(x): + # coerce to an object dtyped + if x.dtype == _NS_DTYPE: shape = x.shape x = tslib.ints_to_pydatetime(x.view(np.int64).ravel()) x = x.reshape(shape) + elif x.dtype == _TD_DTYPE: + shape = x.shape + x = tslib.ints_to_pytimedelta(x.view(np.int64).ravel()) + x = x.reshape(shape) return x - def _where_compat(mask, arr1, arr2): if arr1.dtype == _NS_DTYPE and arr2.dtype == _NS_DTYPE: new_vals = np.where(mask, arr1.view('i8'), arr2.view('i8')) diff --git a/pandas/core/config.py b/pandas/core/config.py index 3e8d76500d128..60dc1d7d0341e 100644 --- a/pandas/core/config.py +++ b/pandas/core/config.py @@ -445,7 +445,7 @@ def register_option(key, defval, doc='', validator=None, cb=None): for k in path: if not bool(re.match('^' + tokenize.Name + '$', k)): raise ValueError("%s is not a valid identifier" % k) - if keyword.iskeyword(key): + if keyword.iskeyword(k): raise ValueError("%s is a python keyword" % k) cursor = _global_config diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index c32796cf082d4..a56d3b93d87da 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -33,28 +33,36 @@ pc_max_rows_doc = """ : int - This sets the maximum number of rows pandas should output when printing - out various output. For example, this value determines whether the repr() - for a dataframe prints out fully or just a summary repr. - 'None' value means unlimited. + If max_rows is exceeded, switch to truncate view. Depending on + `large_repr`, objects are either centrally truncated or printed as + a summary view. 'None' value means unlimited. + + In case python/IPython is running in a terminal and `large_repr` + equals 'truncate' this can be set to 0 and pandas will auto-detect + the height of the terminal and print a truncated object which fits + the screen height. The IPython notebook, IPython qtconsole, or + IDLE do not run in a terminal and hence it is not possible to do + correct auto-detection. """ pc_max_cols_doc = """ : int - max_rows and max_columns are used in __repr__() methods to decide if - to_string() or info() is used to render an object to a string. 
In case - python/IPython is running in a terminal this can be set to 0 and pandas - will correctly auto-detect the width the terminal and swap to a smaller - format in case all columns would not fit vertically. The IPython notebook, - IPython qtconsole, or IDLE do not run in a terminal and hence it is not - possible to do correct auto-detection. - 'None' value means unlimited. + If max_cols is exceeded, switch to truncate view. Depending on + `large_repr`, objects are either centrally truncated or printed as + a summary view. 'None' value means unlimited. + + In case python/IPython is running in a terminal and `large_repr` + equals 'truncate' this can be set to 0 and pandas will auto-detect + the width of the terminal and print a truncated object which fits + the screen width. The IPython notebook, IPython qtconsole, or IDLE + do not run in a terminal and hence it is not possible to do + correct auto-detection. """ -pc_max_levels_doc = """ +pc_max_categories_doc = """ : int - This sets the maximum number of levels pandas should output when printing - out a `Categorical`. + This sets the maximum number of categories pandas should output when printing + out a `Categorical` or a Series of dtype "category". """ pc_max_info_cols_doc = """ @@ -195,6 +203,12 @@ Setting this to None/False restores the values to their initial value. """ +pc_memory_usage_doc = """ +: bool or None + This specifies if the memory usage of a DataFrame should be displayed when + df.info() is called. +""" + style_backup = dict() @@ -229,7 +243,7 @@ def mpl_style_cb(key): validator=is_instance_factory((int, type(None)))) cf.register_option('max_rows', 60, pc_max_rows_doc, validator=is_instance_factory([type(None), int])) - cf.register_option('max_levels', 8, pc_max_levels_doc, validator=is_int) + cf.register_option('max_categories', 8, pc_max_categories_doc, validator=is_int) cf.register_option('max_colwidth', 50, max_colwidth_doc, validator=is_int) cf.register_option('max_columns', 20, pc_max_cols_doc, validator=is_instance_factory([type(None), int])) @@ -266,6 +280,8 @@ def mpl_style_cb(key): # redirected to width, make defval identical cf.register_option('line_width', get_default_val('display.width'), pc_line_width_doc) + cf.register_option('memory_usage', True, pc_memory_usage_doc, + validator=is_instance_factory([type(None), bool])) cf.deprecate_option('display.line_width', msg=pc_line_width_deprecation_warning, diff --git a/pandas/core/format.py b/pandas/core/format.py index 8f749d07296a7..89973754a861c 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -1,28 +1,25 @@ - -#coding: utf-8 +# -*- coding: utf-8 -*- from __future__ import print_function # pylint: disable=W0141 import sys -import re from pandas.core.base import PandasObject -from pandas.core.common import adjoin, isnull, notnull +from pandas.core.common import adjoin, notnull from pandas.core.index import Index, MultiIndex, _ensure_index from pandas import compat from pandas.compat import(StringIO, lzip, range, map, zip, reduce, u, OrderedDict) from pandas.util.terminal import get_terminal_size -from pandas.core.config import get_option, set_option, reset_option +from pandas.core.config import get_option, set_option import pandas.core.common as com import pandas.lib as lib -from pandas.tslib import iNaT +from pandas.tslib import iNaT, Timestamp, Timedelta import numpy as np import itertools import csv -from datetime import time from pandas.tseries.period import PeriodIndex, DatetimeIndex @@ -93,7 +90,7 @@ def _get_footer(self): footer += ', ' 
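The two new display options registered above can be exercised like this (defaults are 8 and True respectively):

    import pandas as pd

    pd.set_option('display.max_categories', 5)   # cap categories shown in the repr footer
    pd.set_option('display.memory_usage', True)  # include memory usage in DataFrame.info()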
footer += "Length: %d" % len(self.categorical) - level_info = self.categorical._repr_level_info() + level_info = self.categorical._repr_categories_info() # Levels are added in a newline if footer: @@ -117,9 +114,11 @@ def to_string(self): return u('') fmt_values = self._get_formatted_values() - pad_space = 10 result = ['%s' % i for i in fmt_values] + result = [i.strip() for i in result] + result = u(', ').join(result) + result = [u('[')+result+u(']')] if self.footer: footer = self._get_footer() if footer: @@ -177,12 +176,11 @@ def _get_footer(self): # level infos are added to the end and in a new line, like it is done for Categoricals # Only added when we request a name if self.name and com.is_categorical_dtype(self.series.dtype): - level_info = self.series.cat._repr_level_info() + level_info = self.series.values._repr_categories_info() if footer: footer += "\n" footer += level_info - return compat.text_type(footer) def _get_formatted_index(self): @@ -250,7 +248,8 @@ class TableFormatter(object): @property def should_show_dimensions(self): - return self.show_dimensions is True or (self.show_dimensions == 'truncate' and self.is_truncated) + return self.show_dimensions is True or (self.show_dimensions == 'truncate' and + self.is_truncated) def _get_formatter(self, i): if isinstance(self.formatters, (list, tuple)): @@ -321,30 +320,69 @@ def __init__(self, frame, buf=None, columns=None, col_space=None, self._chk_truncate() def _chk_truncate(self): + ''' + Checks whether the frame should be truncated. If so, slices + the frame up. + ''' from pandas.tools.merge import concat - truncate_h = self.max_cols and (len(self.columns) > self.max_cols) - truncate_v = self.max_rows and (len(self.frame) > self.max_rows) + # Column of which first element is used to determine width of a dot col + self.tr_size_col = -1 # Cut the data to the information actually printed max_cols = self.max_cols max_rows = self.max_rows + + if max_cols == 0 or max_rows == 0: # assume we are in the terminal (why else = 0) + (w, h) = get_terminal_size() + self.w = w + self.h = h + if self.max_rows == 0: + dot_row = 1 + prompt_row = 1 + if self.show_dimensions: + show_dimension_rows = 3 + n_add_rows = self.header + dot_row + show_dimension_rows + prompt_row + max_rows_adj = self.h - n_add_rows # rows available to fill with actual data + self.max_rows_adj = max_rows_adj + + # Format only rows and columns that could potentially fit the screen + if max_cols == 0 and len(self.frame.columns) > w: + max_cols = w + if max_rows == 0 and len(self.frame) > h: + max_rows = h + + if not hasattr(self, 'max_rows_adj'): + self.max_rows_adj = max_rows + if not hasattr(self, 'max_cols_adj'): + self.max_cols_adj = max_cols + + max_cols_adj = self.max_cols_adj + max_rows_adj = self.max_rows_adj + + truncate_h = max_cols_adj and (len(self.columns) > max_cols_adj) + truncate_v = max_rows_adj and (len(self.frame) > max_rows_adj) + frame = self.frame if truncate_h: - if max_cols > 1: - col_num = (max_cols // 2) - frame = concat( (frame.iloc[:,:col_num],frame.iloc[:,-col_num:]),axis=1 ) - else: + if max_cols_adj == 0: + col_num = len(frame.columns) + elif max_cols_adj == 1: + frame = frame.iloc[:, :max_cols] col_num = max_cols - frame = frame.iloc[:,:max_cols] + else: + col_num = (max_cols_adj // 2) + frame = concat((frame.iloc[:, :col_num], frame.iloc[:, -col_num:]), axis=1) self.tr_col_num = col_num if truncate_v: - if max_rows > 1: - row_num = max_rows // 2 - frame = concat( (frame.iloc[:row_num,:],frame.iloc[-row_num:,:]) ) - else: + if max_rows_adj 
== 0: + row_num = len(frame) + if max_rows_adj == 1: row_num = max_rows - frame = frame.iloc[:max_rows,:] + frame = frame.iloc[:max_rows, :] + else: + row_num = max_rows_adj // 2 + frame = concat((frame.iloc[:row_num, :], frame.iloc[-row_num:, :])) self.tr_row_num = row_num self.tr_frame = frame @@ -360,13 +398,12 @@ def _to_str_columns(self): frame = self.tr_frame # may include levels names also - str_index = self._get_formatted_index(frame) + str_index = self._get_formatted_index(frame) str_columns = self._get_formatted_column_labels(frame) if self.header: stringified = [] - col_headers = frame.columns for i, c in enumerate(frame): cheader = str_columns[i] max_colwidth = max(self.col_space or 0, @@ -377,7 +414,6 @@ def _to_str_columns(self): fmt_values = _make_fixed_width(fmt_values, self.justify, minimum=max_colwidth) - max_len = max(np.max([_strlen(x) for x in fmt_values]), max_colwidth) if self.justify == 'left': @@ -389,9 +425,9 @@ def _to_str_columns(self): else: stringified = [] for i, c in enumerate(frame): - formatter = self._get_formatter(i) fmt_values = self._format_col(i) - fmt_values = _make_fixed_width(fmt_values, self.justify) + fmt_values = _make_fixed_width(fmt_values, self.justify, + minimum=(self.col_space or 0)) stringified.append(fmt_values) @@ -405,12 +441,12 @@ def _to_str_columns(self): if truncate_h: col_num = self.tr_col_num - col_width = len(strcols[col_num][0]) # infer from column header - strcols.insert(col_num + 1, ['...'.center(col_width)] * (len(str_index))) + col_width = len(strcols[self.tr_size_col][0]) # infer from column header + strcols.insert(self.tr_col_num + 1, ['...'.center(col_width)] * (len(str_index))) if truncate_v: n_header_rows = len(str_index) - len(frame) row_num = self.tr_row_num - for ix,col in enumerate(strcols): + for ix, col in enumerate(strcols): cwidth = len(strcols[ix][row_num]) # infer from above row is_dot_col = False if truncate_h: @@ -423,19 +459,19 @@ def _to_str_columns(self): if ix == 0: dot_str = my_str.ljust(cwidth) elif is_dot_col: + cwidth = len(strcols[self.tr_size_col][0]) dot_str = my_str.center(cwidth) else: dot_str = my_str.rjust(cwidth) strcols[ix].insert(row_num + n_header_rows, dot_str) - return strcols def to_string(self): """ Render a DataFrame to a console-friendly tabular output. """ - + from pandas import Series frame = self.frame if len(frame.columns) == 0 or len(frame.index) == 0: @@ -446,10 +482,40 @@ def to_string(self): text = info_line else: strcols = self._to_str_columns() - if self.line_width is None: + if self.line_width is None: # no need to wrap around just print the whole frame text = adjoin(1, *strcols) - else: + elif not isinstance(self.max_cols, int) or self.max_cols > 0: # need to wrap around text = self._join_multiline(*strcols) + else: # max_cols == 0. Try to fit frame to terminal + text = adjoin(1, *strcols).split('\n') + row_lens = Series(text).apply(len) + max_len_col_ix = np.argmax(row_lens) + max_len = row_lens[max_len_col_ix] + headers = [ele[0] for ele in strcols] + # Size of last col determines dot col size. 
See `self._to_str_columns + size_tr_col = len(headers[self.tr_size_col]) + max_len += size_tr_col # Need to make space for largest row plus truncate dot col + dif = max_len - self.w + adj_dif = dif + col_lens = Series([Series(ele).apply(len).max() for ele in strcols]) + n_cols = len(col_lens) + counter = 0 + while adj_dif > 0 and n_cols > 1: + counter += 1 + mid = int(round(n_cols / 2.)) + mid_ix = col_lens.index[mid] + col_len = col_lens[mid_ix] + adj_dif -= (col_len + 1) # adjoin adds one + col_lens = col_lens.drop(mid_ix) + n_cols = len(col_lens) + max_cols_adj = n_cols - self.index # subtract index column + self.max_cols_adj = max_cols_adj + + # Call again _chk_truncate to cut frame appropriately + # and then generate string representation + self._chk_truncate() + strcols = self._to_str_columns() + text = adjoin(1, *strcols) self.buf.writelines(text) @@ -471,8 +537,8 @@ def _join_multiline(self, *strcols): col_bins = _binify(col_widths, lwidth) nbins = len(col_bins) - if self.max_rows and len(self.frame) > self.max_rows: - nrows = self.max_rows + 1 + if self.truncate_v: + nrows = self.max_rows_adj + 1 else: nrows = len(self.frame) @@ -496,7 +562,8 @@ def to_latex(self, column_format=None, longtable=False): Render a DataFrame to a LaTeX tabular/longtable environment output. """ self.escape = self.kwds.get('escape', True) - #TODO: column_format is not settable in df.to_latex + + # TODO: column_format is not settable in df.to_latex def get_col_type(dtype): if issubclass(dtype.type, np.number): return 'r' @@ -513,12 +580,22 @@ def get_col_type(dtype): else: strcols = self._to_str_columns() + if self.index and isinstance(self.frame.index, MultiIndex): + clevels = self.frame.columns.nlevels + strcols.pop(0) + name = any(self.frame.columns.names) + for i, lev in enumerate(self.frame.index.levels): + lev2 = lev.format(name=name) + width = len(lev2[0]) + lev3 = [' ' * width] * clevels + lev2 + strcols.insert(i, lev3) + if column_format is None: dtypes = self.frame.dtypes.values + column_format = ''.join(map(get_col_type, dtypes)) if self.index: - column_format = 'l%s' % ''.join(map(get_col_type, dtypes)) - else: - column_format = '%s' % ''.join(map(get_col_type, dtypes)) + index_format = 'l' * self.frame.index.nlevels + column_format = index_format + column_format elif not isinstance(column_format, compat.string_types): # pragma: no cover raise AssertionError('column_format must be str or unicode, not %s' @@ -532,7 +609,7 @@ def write(buf, frame, column_format, strcols, longtable=False): buf.write('\\begin{longtable}{%s}\n' % column_format) buf.write('\\toprule\n') - nlevels = frame.index.nlevels + nlevels = frame.columns.nlevels for i, row in enumerate(zip(*strcols)): if i == nlevels: buf.write('\\midrule\n') # End of header @@ -546,7 +623,7 @@ def write(buf, frame, column_format, strcols, longtable=False): buf.write('\\bottomrule\n') buf.write('\\endlastfoot\n') if self.escape: - crow = [(x.replace('\\', '\\textbackslash') # escape backslashes first + crow = [(x.replace('\\', '\\textbackslash') # escape backslashes first .replace('_', '\\_') .replace('%', '\\%') .replace('$', '\\$') @@ -601,7 +678,7 @@ def to_html(self, classes=None): raise TypeError('buf is not a file name and it has no write ' ' method') - def _get_formatted_column_labels(self,frame): + def _get_formatted_column_labels(self, frame): from pandas.core.index import _sparsify def is_numeric_dtype(dtype): @@ -613,10 +690,17 @@ def is_numeric_dtype(dtype): fmt_columns = columns.format(sparsify=False, adjoin=False) fmt_columns = 
lzip(*fmt_columns) dtypes = self.frame.dtypes.values + + # if we have a Float level, they don't use leading space at all + restrict_formatting = any([l.is_floating for l in columns.levels]) need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes))) - str_columns = list(zip(*[ - [' ' + y if y not in self.formatters and need_leadsp[x] - else y for y in x] for x in fmt_columns])) + + def space_format(x, y): + if y not in self.formatters and need_leadsp[x] and not restrict_formatting: + return ' ' + y + return y + + str_columns = list(zip(*[[space_format(x, y) for y in x] for x in fmt_columns])) if self.sparsify: str_columns = _sparsify(str_columns) @@ -635,6 +719,7 @@ def is_numeric_dtype(dtype): for x in str_columns: x.append('') + # self.str_columns = str_columns return str_columns @property @@ -645,8 +730,8 @@ def has_index_names(self): def has_column_names(self): return _has_names(self.frame.columns) - def _get_formatted_index(self,frame): - # Note: this is only used by to_string(), not by to_html(). + def _get_formatted_index(self, frame): + # Note: this is only used by to_string() and to_latex(), not by to_html(). index = frame.index columns = frame.columns @@ -703,7 +788,8 @@ def __init__(self, formatter, classes=None, max_rows=None, max_cols=None): self.max_rows = max_rows or len(self.fmt.frame) self.max_cols = max_cols or len(self.fmt.columns) self.show_dimensions = self.fmt.show_dimensions - self.is_truncated = self.max_rows < len(self.fmt.frame) or self.max_cols < len(self.fmt.columns) + self.is_truncated = (self.max_rows < len(self.fmt.frame) or + self.max_cols < len(self.fmt.columns)) def write(self, s, indent=0): rs = com.pprint_thing(s) @@ -837,7 +923,7 @@ def _column_header(): if self.fmt.sparsify: recs_new = {} # Increment tags after ... col. - for tag,span in list(records.items()): + for tag, span in list(records.items()): if tag >= ins_col: recs_new[tag + 1] = span elif tag + span > ins_col: @@ -846,8 +932,8 @@ def _column_header(): values = values[:ins_col] + (u('...'),) + \ values[ins_col:] else: # sparse col headers do not receive a ... - values = values[:ins_col] + \ - (values[ins_col - 1],) + values[ins_col:] + values = (values[:ins_col] + (values[ins_col - 1],) + + values[ins_col:]) else: recs_new[tag] = span # if ins_col lies between tags, all col headers get ... @@ -861,7 +947,7 @@ def _column_header(): records[ins_col] = 1 else: recs_new = {} - for tag,span in list(records.items()): + for tag, span in list(records.items()): if tag >= ins_col: recs_new[tag + 1] = span else: @@ -947,10 +1033,11 @@ def _write_regular_rows(self, fmt_values, indent): else: index_values = self.fmt.tr_frame.index.format() + row = [] for i in range(nrows): if truncate_v and i == (self.fmt.tr_row_num): - str_sep_row = [ '...' for ele in row ] + str_sep_row = ['...' 
for ele in row] self.write_tr(str_sep_row, indent, self.indent_delta, tags=None, nindex_levels=1) @@ -974,15 +1061,13 @@ def _write_hierarchical_rows(self, fmt_values, indent): nrows = len(frame) row_levels = self.frame.index.nlevels - idx_values = frame.index.format(sparsify=False, adjoin=False, - names=False) + idx_values = frame.index.format(sparsify=False, adjoin=False, names=False) idx_values = lzip(*idx_values) if self.fmt.sparsify: # GH3547 sentinel = com.sentinel_factory() - levels = frame.index.format(sparsify=sentinel, - adjoin=False, names=False) + levels = frame.index.format(sparsify=sentinel, adjoin=False, names=False) level_lengths = _get_level_lengths(levels, sentinel) inner_lvl = len(level_lengths) - 1 @@ -990,28 +1075,28 @@ def _write_hierarchical_rows(self, fmt_values, indent): # Insert ... row and adjust idx_values and # level_lengths to take this into account. ins_row = self.fmt.tr_row_num - for lnum,records in enumerate(level_lengths): + for lnum, records in enumerate(level_lengths): rec_new = {} - for tag,span in list(records.items()): + for tag, span in list(records.items()): if tag >= ins_row: rec_new[tag + 1] = span elif tag + span > ins_row: rec_new[tag] = span + 1 dot_row = list(idx_values[ins_row - 1]) dot_row[-1] = u('...') - idx_values.insert(ins_row,tuple(dot_row)) + idx_values.insert(ins_row, tuple(dot_row)) else: rec_new[tag] = span # If ins_row lies between tags, all cols idx cols receive ... if tag + span == ins_row: rec_new[ins_row] = 1 if lnum == 0: - idx_values.insert(ins_row,tuple([u('...')]*len(level_lengths))) + idx_values.insert(ins_row, tuple([u('...')]*len(level_lengths))) level_lengths[lnum] = rec_new level_lengths[inner_lvl][ins_row] = 1 for ix_col in range(len(fmt_values)): - fmt_values[ix_col].insert(ins_row,'...') + fmt_values[ix_col].insert(ins_row, '...') nrows += 1 for i in range(nrows): @@ -1049,6 +1134,7 @@ def _write_hierarchical_rows(self, fmt_values, indent): self.write_tr(row, indent, self.indent_delta, tags=None, nindex_levels=frame.index.nlevels) + def _get_level_lengths(levels, sentinel=''): from itertools import groupby @@ -1229,10 +1315,10 @@ def _helper_csv(self, writer, na_rep=None, cols=None, writer.writerow(encoded_cols) if date_format is None: - date_formatter = lambda x: lib.Timestamp(x)._repr_base + date_formatter = lambda x: Timestamp(x)._repr_base else: def strftime_with_nulls(x): - x = lib.Timestamp(x) + x = Timestamp(x) if notnull(x): return x.strftime(date_format) @@ -1272,7 +1358,7 @@ def strftime_with_nulls(x): if float_format is not None and com.is_float(val): val = float_format % val - elif isinstance(val, (np.datetime64, lib.Timestamp)): + elif isinstance(val, (np.datetime64, Timestamp)): val = date_formatter(val) row_fields.append(val) @@ -1302,7 +1388,7 @@ def save(self): self.writer = csv.writer(f, **writer_kwargs) if self.engine == 'python': - # to be removed in 0.13 + # to be removed in 0.13 self._helper_csv(self.writer, na_rep=self.na_rep, float_format=self.float_format, cols=self.cols, header=self.header, @@ -1512,9 +1598,9 @@ def _format_value(self, val): val = self.na_rep elif com.is_float(val): if np.isposinf(val): - val = '-%s' % self.inf_rep - elif np.isneginf(val): val = self.inf_rep + elif np.isneginf(val): + val = '-%s' % self.inf_rep elif self.float_format is not None: val = float(self.float_format % val) return val @@ -1738,7 +1824,7 @@ def get_formatted_cells(self): cell.val = self._format_value(cell.val) yield cell -#---------------------------------------------------------------------- +# 
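# Hedged usage sketch for the CSV changes above: datetime values are boxed
# as Timestamp before date_format is applied, so a custom strftime pattern
# controls how datetime columns are written.
import pandas as pd

df = pd.DataFrame({'when': pd.to_datetime(['2014-01-01', '2014-06-15']),
                   'val': [1, 2]})
print(df.to_csv(date_format='%Y%m%d'))   # dates written as 20140101, 20140615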
---------------------------------------------------------------------- # Array formatters @@ -1907,10 +1993,10 @@ def __init__(self, values, nat_rep='NaT', date_format=None, **kwargs): self.date_format = date_format def _format_strings(self): - formatter = self.formatter or _get_format_datetime64_from_values( - self.values, - nat_rep=self.nat_rep, - date_format=self.date_format) + formatter = (self.formatter or + _get_format_datetime64_from_values(self.values, + nat_rep=self.nat_rep, + date_format=self.date_format)) fmt_values = [formatter(x) for x in self.values] @@ -1921,8 +2007,8 @@ def _format_datetime64(x, tz=None, nat_rep='NaT'): if x is None or lib.checknull(x): return nat_rep - if tz is not None or not isinstance(x, lib.Timestamp): - x = lib.Timestamp(x, tz=tz) + if tz is not None or not isinstance(x, Timestamp): + x = Timestamp(x, tz=tz) return str(x) @@ -1931,8 +2017,8 @@ def _format_datetime64_dateonly(x, nat_rep='NaT', date_format=None): if x is None or lib.checknull(x): return nat_rep - if not isinstance(x, lib.Timestamp): - x = lib.Timestamp(x) + if not isinstance(x, Timestamp): + x = Timestamp(x) if date_format: return x.strftime(date_format) @@ -1941,21 +2027,26 @@ def _format_datetime64_dateonly(x, nat_rep='NaT', date_format=None): def _is_dates_only(values): - for d in values: - if isinstance(d, np.datetime64): - d = lib.Timestamp(d) - - if d is not None and not lib.checknull(d) and d._has_time_component(): - return False - return True + # return a boolean if we are only dates (and don't have a timezone) + from pandas import DatetimeIndex + values = DatetimeIndex(values) + if values.tz is not None: + return False + values_int = values.asi8 + consider_values = values_int != iNaT + one_day_nanos = (86400 * 1e9) + even_days = np.logical_and(consider_values, values_int % one_day_nanos != 0).sum() == 0 + if even_days: + return True + return False def _get_format_datetime64(is_dates_only, nat_rep='NaT', date_format=None): if is_dates_only: return lambda x, tz=None: _format_datetime64_dateonly(x, - nat_rep=nat_rep, - date_format=date_format) + nat_rep=nat_rep, + date_format=date_format) else: return lambda x, tz=None: _format_datetime64(x, tz=tz, nat_rep=nat_rep) @@ -1971,35 +2062,53 @@ def _get_format_datetime64_from_values(values, class Timedelta64Formatter(GenericArrayFormatter): - def _format_strings(self): - formatter = self.formatter or _get_format_timedelta64(self.values) + def __init__(self, values, nat_rep='NaT', box=False, **kwargs): + super(Timedelta64Formatter, self).__init__(values, **kwargs) + self.nat_rep = nat_rep + self.box = box + def _format_strings(self): + formatter = self.formatter or _get_format_timedelta64(self.values, nat_rep=self.nat_rep, + box=self.box) fmt_values = [formatter(x) for x in self.values] - return fmt_values -def _get_format_timedelta64(values): +def _get_format_timedelta64(values, nat_rep='NaT', box=False): + """ + Return a formatter function for a range of timedeltas. 
+ These will all have the same format argument + + If box, then show the return in quotes + """ + values_int = values.astype(np.int64) consider_values = values_int != iNaT - one_day_in_nanos = (86400 * 1e9) - even_days = np.logical_and(consider_values, values_int % one_day_in_nanos != 0).sum() == 0 - all_sub_day = np.logical_and(consider_values, np.abs(values_int) >= one_day_in_nanos).sum() == 0 + one_day_nanos = (86400 * 1e9) + even_days = np.logical_and(consider_values, values_int % one_day_nanos != 0).sum() == 0 + all_sub_day = np.logical_and(consider_values, np.abs(values_int) >= one_day_nanos).sum() == 0 - format_short = even_days or all_sub_day - format = "short" if format_short else "long" + if even_days: + format = 'even_day' + elif all_sub_day: + format = 'sub_day' + else: + format = 'long' - def impl(x): + def _formatter(x): if x is None or lib.checknull(x): - return 'NaT' - elif format_short and com.is_integer(x) and x.view('int64') == 0: - return "0 days" if even_days else "00:00:00" - else: - return lib.repr_timedelta64(x, format=format) + return nat_rep + + if not isinstance(x, Timedelta): + x = Timedelta(x) + result = x._repr_base(format=format) + if box: + result = "'{0}'".format(result) + return result - return impl + return _formatter def _make_fixed_width(strings, justify='right', minimum=None): @@ -2082,7 +2191,7 @@ def _has_names(index): return index.name is not None -#------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ # Global formatting options _initial_defencoding = None diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 352ac52281c54..d90ef76ddfa5e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -26,7 +26,7 @@ from pandas.core.common import (isnull, notnull, PandasError, _try_sort, _default_index, _maybe_upcast, _is_sequence, _infer_dtype_from_scalar, _values_from_object, - is_list_like, _get_dtype) + is_list_like, _get_dtype, _maybe_box_datetimelike) from pandas.core.generic import NDFrame, _shared_docs from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import (_maybe_droplevels, @@ -43,6 +43,7 @@ from pandas.compat import(range, zip, lrange, lmap, lzip, StringIO, u, OrderedDict, raise_with_traceback) from pandas import compat +from pandas.sparse.array import SparseArray from pandas.util.decorators import deprecate, Appender, Substitution, \ deprecate_kwarg @@ -639,19 +640,25 @@ def from_dict(cls, data, orient='columns', dtype=None): return cls(data, index=index, columns=columns, dtype=dtype) - def to_dict(self, outtype='dict'): - """ - Convert DataFrame to dictionary. + @deprecate_kwarg(old_arg_name='outtype', new_arg_name='orient') + def to_dict(self, orient='dict'): + """Convert DataFrame to dictionary. Parameters ---------- - outtype : str {'dict', 'list', 'series', 'records'} - Determines the type of the values of the dictionary. The - default `dict` is a nested dictionary {column -> {index -> value}}. - `list` returns {column -> list(values)}. `series` returns - {column -> Series(values)}. `records` returns [{columns -> value}]. - Abbreviations are allowed. + orient : str {'dict', 'list', 'series', 'split', 'records'} + Determines the type of the values of the dictionary. 
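# Hedged example for the timedelta formatter rework above: reprs now come
# from Timedelta._repr_base using an 'even_day', 'sub_day' or 'long' format,
# and box=True wraps each formatted value in quotes.
import pandas as pd

td = pd.Series(pd.to_timedelta(['1 days', '2 days 03:00:00']))
print(td)   # mixed values use the 'long' (days + time) form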
+ + - dict (default) : dict like {column -> {index -> value}} + - list : dict like {column -> [values]} + - series : dict like {column -> Series(values)} + - split : dict like + {index -> [index], columns -> [columns], data -> [values]} + - records : list like + [{column -> value}, ... , {column -> value}] + Abbreviations are allowed. `s` indicates `series` and `sp` + indicates `split`. Returns ------- @@ -660,13 +667,17 @@ def to_dict(self, outtype='dict'): if not self.columns.is_unique: warnings.warn("DataFrame columns are not unique, some " "columns will be omitted.", UserWarning) - if outtype.lower().startswith('d'): + if orient.lower().startswith('d'): return dict((k, v.to_dict()) for k, v in compat.iteritems(self)) - elif outtype.lower().startswith('l'): + elif orient.lower().startswith('l'): return dict((k, v.tolist()) for k, v in compat.iteritems(self)) - elif outtype.lower().startswith('s'): + elif orient.lower().startswith('sp'): + return {'index': self.index.tolist(), + 'columns': self.columns.tolist(), + 'data': self.values.tolist()} + elif orient.lower().startswith('s'): return dict((k, v) for k, v in compat.iteritems(self)) - elif outtype.lower().startswith('r'): + elif orient.lower().startswith('r'): return [dict((k, v) for k, v in zip(self.columns, row)) for row in self.values] else: # pragma: no cover @@ -1389,7 +1400,7 @@ def to_latex(self, buf=None, columns=None, col_space=None, colSpace=None, if buf is None: return formatter.buf.getvalue() - def info(self, verbose=None, buf=None, max_cols=None): + def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None): """ Concise summary of a DataFrame. @@ -1403,6 +1414,12 @@ def info(self, verbose=None, buf=None, max_cols=None): max_cols : int, default None Determines whether full summary or short summary is printed. None follows the `display.max_info_columns` setting. + memory_usage : boolean, default None + Specifies whether total memory usage of the DataFrame + elements (including index) should be displayed. None follows + the `display.memory_usage` setting. True or False overrides + the `display.memory_usage` setting. Memory usage is shown in + human-readable units (base-2 representation). """ from pandas.core.format import _put_lines @@ -1461,6 +1478,14 @@ def _verbose_repr(): def _non_verbose_repr(): lines.append(self.columns.summary(name='Columns')) + def _sizeof_fmt(num, size_qualifier): + # returns size in human readable format + for x in ['bytes', 'KB', 'MB', 'GB', 'TB']: + if num < 1024.0: + return "%3.1f%s %s" % (num, size_qualifier, x) + num /= 1024.0 + return "%3.1f%s %s" % (num, size_qualifier, 'PB') + if verbose: _verbose_repr() elif verbose is False: # specifically set to False, not nesc None @@ -1474,8 +1499,51 @@ def _non_verbose_repr(): counts = self.get_dtype_counts() dtypes = ['%s(%d)' % k for k in sorted(compat.iteritems(counts))] lines.append('dtypes: %s' % ', '.join(dtypes)) + if memory_usage is None: + memory_usage = get_option('display.memory_usage') + if memory_usage: # append memory usage of df to display + # size_qualifier is just a best effort; not guaranteed to catch all + # cases (e.g., it misses categorical data even with object + # categories) + size_qualifier = ('+' if 'object' in counts + or self.index.dtype.kind == 'O' else '') + mem_usage = self.memory_usage(index=True).sum() + lines.append("memory usage: %s\n" % + _sizeof_fmt(mem_usage, size_qualifier)) _put_lines(buf, lines) + def memory_usage(self, index=False): + """Memory usage of DataFrame columns. 
+ + Parameters + ---------- + index : bool + Specifies whether to include memory usage of DataFrame's + index in returned Series. If `index=True` (default is False) + the first index of the Series is `Index`. + + Returns + ------- + sizes : Series + A series with column names as index and memory usage of + columns with units of bytes. + + Notes + ----- + Memory usage does not include memory consumed by elements that + are not components of the array. + + See Also + -------- + numpy.ndarray.nbytes + """ + result = Series([ c.values.nbytes for col, c in self.iteritems() ], + index=self.columns) + if index: + result = Series(self.index.values.nbytes, + index=['Index']).append(result) + return result + def transpose(self): """Transpose index and columns""" return super(DataFrame, self).transpose(1, 0) @@ -1538,7 +1606,7 @@ def get_value(self, index, col, takeable=False): if takeable: series = self._iget_item_cache(col) - return series.values[index] + return _maybe_box_datetimelike(series.values[index]) series = self._get_item_cache(col) engine = self.index._engine @@ -1902,6 +1970,7 @@ def select_dtypes(self, include=None, exclude=None): this will return *all* object dtype columns * See the `numpy dtype hierarchy `__ + * To select Pandas categorical dtypes, use 'category' Examples -------- @@ -2141,6 +2210,15 @@ def reindexer(value): value = reindexer(value) elif isinstance(value, DataFrame): + # align right-hand-side columns if self.columns + # is multi-index and self[key] is a sub-frame + if isinstance(self.columns, MultiIndex) and key in self.columns: + loc = self.columns.get_loc(key) + if isinstance(loc, (slice, Series, np.ndarray, Index)): + cols = _maybe_droplevels(self.columns[loc], key) + if len(cols) and not cols.equals(value.columns): + value = value.reindex_axis(cols, axis=1) + # now align rows value = reindexer(value).T elif isinstance(value, Categorical): @@ -2164,8 +2242,8 @@ def reindexer(value): value = np.repeat(value, len(self.index)).astype(dtype) value = com._possibly_cast_to_datetime(value, dtype) - # return categoricals directly - if isinstance(value, Categorical): + # return unconsolidatables directly + if isinstance(value, (Categorical, SparseArray)): return value # broadcast across multiple columns if necessary @@ -2258,8 +2336,7 @@ def _reindex_axes(self, axes, level, limit, method, fill_value, copy): def _reindex_index(self, new_index, method, copy, level, fill_value=NA, limit=None): new_index, indexer = self.index.reindex(new_index, method, level, - limit=limit, - copy_if_needed=True) + limit=limit) return self._reindex_with_indexers({0: [new_index, indexer]}, copy=copy, fill_value=fill_value, allow_dups=False) @@ -2267,8 +2344,7 @@ def _reindex_index(self, new_index, method, copy, level, fill_value=NA, def _reindex_columns(self, new_columns, copy, level, fill_value=NA, limit=None): new_columns, indexer = self.columns.reindex(new_columns, level=level, - limit=limit, - copy_if_needed=True) + limit=limit) return self._reindex_with_indexers({1: [new_columns, indexer]}, copy=copy, fill_value=fill_value, allow_dups=False) @@ -2509,7 +2585,6 @@ def _maybe_casted_values(index, labels=None): if not inplace: return new_obj - delevel = deprecate('delevel', reset_index) #---------------------------------------------------------------------- # Reindex-based selection methods @@ -2551,7 +2626,11 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, agg_obj = self if subset is not None: ax = self._get_axis(agg_axis) - agg_obj = 
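# Hedged usage sketch for the frame.py changes above: to_dict grows an
# orient='split' option (outtype is deprecated in favour of orient), and
# memory usage can be reported per column or appended to info().
import numpy as np
import pandas as pd

df = pd.DataFrame({'a': np.arange(3, dtype=np.int64), 'b': [1.0, 2.0, 3.0]})
print(df.to_dict(orient='split'))
# {'index': [0, 1, 2], 'columns': ['a', 'b'], 'data': [[...], ...]}
print(df.memory_usage())              # bytes per column
print(df.memory_usage(index=True))    # first entry labelled 'Index'
df.info(memory_usage=True)            # summary ends with "memory usage: ..."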
self.take(ax.get_indexer_for(subset),axis=agg_axis) + indices = ax.get_indexer_for(subset) + check = indices == -1 + if check.any(): + raise KeyError(list(np.compress(check,subset))) + agg_obj = self.take(indices,axis=agg_axis) count = agg_obj.count(axis=agg_axis) @@ -2770,6 +2849,12 @@ def trans(v): na_position=na_position) elif isinstance(labels, MultiIndex): + + # make sure that the axis is lexsorted to start + # if not we need to reconstruct to get the correct indexer + if not labels.is_lexsorted(): + labels = MultiIndex.from_tuples(labels.values) + indexer = _lexsort_indexer(labels.labels, orders=ascending, na_position=na_position) indexer = com._ensure_platform_int(indexer) @@ -3641,8 +3726,9 @@ def applymap(self, func): # if we have a dtype == 'M8[ns]', provide boxed values def infer(x): - if com.is_datetime64_dtype(x): - x = lib.map_infer(_values_from_object(x), lib.Timestamp) + if com.needs_i8_conversion(x): + f = com.i8_boxer(x) + x = lib.map_infer(_values_from_object(x), f) return lib.map_infer(_values_from_object(x), func) return self.apply(infer) @@ -3682,7 +3768,7 @@ def append(self, other, ignore_index=False, verify_integrity=False): 'ignore_index=True') index = None if other.name is None else [other.name] - combined_columns = self.columns.tolist() + ((self.columns | other.index) - self.columns).tolist() + combined_columns = self.columns.tolist() + self.columns.union(other.index).difference(self.columns).tolist() other = other.reindex(combined_columns, copy=False) other = DataFrame(other.values.reshape((1, len(other))), index=index, columns=combined_columns).convert_objects() diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5064545404fb0..53abfe10fe8ea 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -23,7 +23,7 @@ _maybe_box_datetimelike, ABCSeries, SettingWithCopyError, SettingWithCopyWarning) import pandas.core.nanops as nanops -from pandas.util.decorators import Appender, Substitution +from pandas.util.decorators import Appender, Substitution, deprecate_kwarg from pandas.core import config # goal is to be able to define the docs close to function, while still being @@ -105,7 +105,7 @@ def _validate_dtype(self, dtype): """ validate the passed dtype """ if dtype is not None: - dtype = np.dtype(dtype) + dtype = com._coerce_to_dtype(dtype) # a compound dtype if dtype.kind == 'V': @@ -915,8 +915,8 @@ def to_msgpack(self, path_or_buf=None, **kwargs): from pandas.io import packers return packers.to_msgpack(path_or_buf, self, **kwargs) - def to_sql(self, name, con, flavor='sqlite', if_exists='fail', index=True, - index_label=None): + def to_sql(self, name, con, flavor='sqlite', schema=None, if_exists='fail', + index=True, index_label=None, chunksize=None): """ Write records stored in a DataFrame to a SQL database. @@ -932,6 +932,9 @@ def to_sql(self, name, con, flavor='sqlite', if_exists='fail', index=True, The flavor of SQL to use. Ignored when using SQLAlchemy engine. 'mysql' is deprecated and will be removed in future versions, but it will be further supported through SQLAlchemy engines. + schema : string, default None + Specify the schema (if database flavor supports this). If None, use + default schema. if_exists : {'fail', 'replace', 'append'}, default 'fail' - fail: If table exists, do nothing. - replace: If table exists, drop it, recreate it, and insert data. @@ -942,12 +945,15 @@ def to_sql(self, name, con, flavor='sqlite', if_exists='fail', index=True, Column label for index column(s). 
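# Hedged example of the stricter dropna above: labels in `subset` that are
# missing from the frame now raise KeyError instead of being ignored.
import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1.0, np.nan], 'b': [1, 2]})
print(df.dropna(subset=['a']))       # keeps only the row with a valid 'a'
try:
    df.dropna(subset=['a', 'missing'])
except KeyError as exc:
    print(exc)                       # ['missing']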
If None is given (default) and `index` is True, then the index names are used. A sequence should be given if the DataFrame uses MultiIndex. + chunksize : int, default None + If not None, then rows will be written in batches of this size at a + time. If None, all rows will be written at once. """ from pandas.io import sql sql.to_sql( - self, name, con, flavor=flavor, if_exists=if_exists, index=index, - index_label=index_label) + self, name, con, flavor=flavor, schema=schema, if_exists=if_exists, + index=index, index_label=index_label, chunksize=chunksize) def to_pickle(self, path): """ @@ -1038,7 +1044,7 @@ def get(self, key, default=None): """ try: return self[key] - except (KeyError, ValueError): + except (KeyError, ValueError, IndexError): return default def __getitem__(self, item): @@ -1102,9 +1108,21 @@ def _is_view(self): """ boolean : return if I am a view of another array """ return self._data.is_view - def _maybe_update_cacher(self, clear=False): - """ see if we need to update our parent cacher - if clear, then clear our cache """ + def _maybe_update_cacher(self, clear=False, verify_is_copy=True): + """ + + see if we need to update our parent cacher + if clear, then clear our cache + + Parameters + ---------- + clear : boolean, default False + clear the item cache + verify_is_copy : boolean, default True + provide is_copy checks + + """ + cacher = getattr(self, '_cacher', None) if cacher is not None: ref = cacher[1]() @@ -1119,8 +1137,8 @@ def _maybe_update_cacher(self, clear=False): except: pass - # check if we are a copy - self._check_setitem_copy(stacklevel=5, t='referant') + if verify_is_copy: + self._check_setitem_copy(stacklevel=5, t='referant') if clear: self._clear_item_cache() @@ -1377,7 +1395,7 @@ def xs(self, key, axis=0, level=None, copy=None, drop_level=True): xs is only for getting, not setting values. MultiIndex Slicers is a generic way to get/set values on any level or levels - it is a superset of xs functionality, see :ref:`MultiIndex Slicers ` + it is a superset of xs functionality, see :ref:`MultiIndex Slicers ` """ if copy is not None: @@ -1558,14 +1576,23 @@ def drop(self, labels, axis=0, level=None, inplace=False, **kwargs): else: return result - def _update_inplace(self, result): - "replace self internals with result." + def _update_inplace(self, result, verify_is_copy=True): + """ + replace self internals with result. + + Parameters + ---------- + verify_is_copy : boolean, default True + provide is_copy checks + + """ # NOTE: This does *not* call __finalize__ and that's an explicit # decision that we may revisit in the future. + self._reset_cache() self._clear_item_cache() self._data = getattr(result,'_data',result) - self._maybe_update_cacher() + self._maybe_update_cacher(verify_is_copy=verify_is_copy) def add_prefix(self, prefix): """ @@ -1622,6 +1649,7 @@ def sort_index(self, axis=0, ascending=True): new_axis = labels.take(sort_index) return self.reindex(**{axis_name: new_axis}) + _shared_docs['reindex'] = """ Conform %(klass)s to new index with optional filling logic, placing NA/NaN in locations having no value in the previous index. 
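# Hedged usage sketch for the extended to_sql signature: `schema` and
# `chunksize` are forwarded to pandas.io.sql; the sqlite fallback used here
# has no schemas, and chunksize (where the backend supports it) batches the
# inserts instead of writing all rows at once.
import sqlite3
import pandas as pd

df = pd.DataFrame({'a': list(range(10))})
con = sqlite3.connect(':memory:')
df.to_sql('demo', con, flavor='sqlite', if_exists='replace', chunksize=4)
print(pd.read_sql_query('SELECT COUNT(*) AS n FROM demo', con))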
A new object @@ -1700,15 +1728,11 @@ def _reindex_axes(self, axes, level, limit, method, fill_value, copy): if labels is None: continue - # convert to an index if we are not a multi-selection ax = self._get_axis(a) - if level is None: - labels = _ensure_index(labels) - - axis = self._get_axis_number(a) new_index, indexer = ax.reindex( labels, level=level, limit=limit, method=method) + axis = self._get_axis_number(a) obj = obj._reindex_with_indexers( {axis: [new_index, indexer]}, method=method, fill_value=fill_value, limit=limit, copy=copy, @@ -1769,8 +1793,8 @@ def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True, axis_name = self._get_axis_name(axis) axis_values = self._get_axis(axis_name) method = com._clean_fill_method(method) - new_index, indexer = axis_values.reindex( - labels, method, level, limit=limit, copy_if_needed=True) + new_index, indexer = axis_values.reindex(labels, method, level, + limit=limit) return self._reindex_with_indexers( {axis: [new_index, indexer]}, method=method, fill_value=fill_value, limit=limit, copy=copy) @@ -2224,10 +2248,10 @@ def fillna(self, value=None, method=None, axis=0, inplace=False, Method to use for filling holes in reindexed Series pad / ffill: propagate last valid observation forward to next valid backfill / bfill: use NEXT valid observation to fill gap - value : scalar, dict, or Series - Value to use to fill holes (e.g. 0), alternately a dict/Series of + value : scalar, dict, Series, or DataFrame + Value to use to fill holes (e.g. 0), alternately a dict/Series/DataFrame of values specifying which value to use for each index (for a Series) or - column (for a DataFrame). (values not in the dict/Series will not be + column (for a DataFrame). (values not in the dict/Series/DataFrame will not be filled). This value cannot be a list. 
axis : {0, 1}, default 0 * 0: fill column-by-column @@ -2259,6 +2283,7 @@ def fillna(self, value=None, method=None, axis=0, inplace=False, axis = self._get_axis_number(axis) method = com._clean_fill_method(method) + from pandas import DataFrame if value is None: if method is None: raise ValueError('must specify a fill method or value') @@ -2301,10 +2326,14 @@ def fillna(self, value=None, method=None, axis=0, inplace=False, if len(self._get_axis(axis)) == 0: return self - if self.ndim == 1 and value is not None: + if self.ndim == 1: if isinstance(value, (dict, com.ABCSeries)): from pandas import Series value = Series(value) + elif not com.is_list_like(value): + pass + else: + raise ValueError("invalid fill value with a %s" % type(value)) new_data = self._data.fillna(value=value, limit=limit, @@ -2324,11 +2353,15 @@ def fillna(self, value=None, method=None, axis=0, inplace=False, obj = result[k] obj.fillna(v, limit=limit, inplace=True) return result - else: + elif not com.is_list_like(value): new_data = self._data.fillna(value=value, limit=limit, inplace=inplace, downcast=downcast) + elif isinstance(value, DataFrame) and self.ndim == 2: + new_data = self.where(self.notnull(), value) + else: + raise ValueError("invalid fill value with a %s" % type(value)) if inplace: self._update_inplace(new_data) @@ -3105,9 +3138,13 @@ def _align_series(self, other, join='outer', axis=None, level=None, raise ValueError('cannot align series to a series other than ' 'axis 0') - join_index, lidx, ridx = self.index.join(other.index, how=join, - level=level, - return_indexers=True) + # equal + if self.index.equals(other.index): + join_index, lidx, ridx = None, None, None + else: + join_index, lidx, ridx = self.index.join(other.index, how=join, + level=level, + return_indexers=True) left_result = self._reindex_indexer(join_index, lidx, copy) right_result = other._reindex_indexer(join_index, ridx, copy) @@ -3553,7 +3590,10 @@ def _tz_convert(ax, tz): result.set_axis(axis,ax) return result.__finalize__(self) - def tz_localize(self, tz, axis=0, level=None, copy=True, infer_dst=False): + @deprecate_kwarg(old_arg_name='infer_dst', new_arg_name='ambiguous', + mapping={True: 'infer', False: 'raise'}) + def tz_localize(self, tz, axis=0, level=None, copy=True, + ambiguous='raise'): """ Localize tz-naive TimeSeries to target time zone @@ -3566,8 +3606,14 @@ def tz_localize(self, tz, axis=0, level=None, copy=True, infer_dst=False): must be None copy : boolean, default True Also make a copy of the underlying data - infer_dst : boolean, default False - Attempt to infer fall dst-transition times based on order + ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' + - 'infer' will attempt to infer fall dst-transition hours based on order + - bool-ndarray where True signifies a DST time, False designates + a non-DST time (note that this flag is only applicable for ambiguous times) + - 'NaT' will return NaT where there are ambiguous times + - 'raise' will raise an AmbiguousTimeError if there are ambiguous times + infer_dst : boolean, default False (DEPRECATED) + Attempt to infer fall dst-transition hours based on order Returns ------- @@ -3575,7 +3621,7 @@ def tz_localize(self, tz, axis=0, level=None, copy=True, infer_dst=False): axis = self._get_axis_number(axis) ax = self._get_axis(axis) - def _tz_localize(ax, tz, infer_dst): + def _tz_localize(ax, tz, ambiguous): if not hasattr(ax, 'tz_localize'): if len(ax) > 0: ax_name = self._get_axis_name(axis) @@ -3584,19 +3630,19 @@ def _tz_localize(ax, tz, infer_dst): else: ax = 
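# Hedged example of the new DataFrame-valued fill above: NaNs in `df` are
# replaced by the aligned entries of `other` (implemented via self.where).
import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1.0, np.nan], 'b': [np.nan, 4.0]})
other = pd.DataFrame({'a': [10.0, 20.0], 'b': [30.0, 40.0]})
print(df.fillna(other))   # -> a: [1.0, 20.0], b: [30.0, 4.0]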
DatetimeIndex([],tz=tz) else: - ax = ax.tz_localize(tz, infer_dst=infer_dst) + ax = ax.tz_localize(tz, ambiguous=ambiguous) return ax # if a level is given it must be a MultiIndex level or # equivalent to the axis name if isinstance(ax, MultiIndex): level = ax._get_level_number(level) - new_level = _tz_localize(ax.levels[level], tz, infer_dst) + new_level = _tz_localize(ax.levels[level], tz, ambiguous) ax = ax.set_levels(new_level, level=level) else: if level not in (None, 0, ax.name): raise ValueError("The level {0} is not valid".format(level)) - ax = _tz_localize(ax, tz, infer_dst) + ax = _tz_localize(ax, tz, ambiguous) result = self._constructor(self._data, copy=copy) result.set_axis(axis,ax) @@ -3629,6 +3675,17 @@ def abs(self): The percentiles to include in the output. Should all be in the interval [0, 1]. By default `percentiles` is [.25, .5, .75], returning the 25th, 50th, and 75th percentiles. + include, exclude : list-like, 'all', or None (default) + Specify the form of the returned result. Either: + + - None to both (default). The result will include only numeric-typed + columns or, if none are, only categorical columns. + - A list of dtypes or strings to be included/excluded. + To select all numeric types use numpy numpy.number. To select + categorical objects use type object. See also the select_dtypes + documentation. eg. df.describe(include=['O']) + - If include is the string 'all', the output column-set will + match the input one. Returns ------- @@ -3636,20 +3693,33 @@ def abs(self): Notes ----- - For numeric dtypes the index includes: count, mean, std, min, + The output DataFrame index depends on the requested dtypes: + + For numeric dtypes, it will include: count, mean, std, min, max, and lower, 50, and upper percentiles. - If self is of object dtypes (e.g. timestamps or strings), the output + For object dtypes (e.g. timestamps or strings), the index will include the count, unique, most common, and frequency of the most common. Timestamps also include the first and last items. + For mixed dtypes, the index will be the union of the corresponding + output types. Non-applicable entries will be filled with NaN. + Note that mixed-dtype outputs can only be returned from mixed-dtype + inputs and appropriate use of the include/exclude arguments. + If multiple values have the highest count, then the `count` and `most common` pair will be arbitrarily chosen from among those with the highest count. + + The include, exclude arguments are ignored for Series. + + See also + -------- + DataFrame.select_dtypes """ @Appender(_shared_docs['describe'] % _shared_doc_kwargs) - def describe(self, percentile_width=None, percentiles=None): + def describe(self, percentile_width=None, percentiles=None, include=None, exclude=None ): if self.ndim >= 3: msg = "describe is not implemented on on Panel or PanelND objects." 
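# Hedged usage sketch for the infer_dst -> ambiguous rename above: the old
# boolean maps to 'infer'/'raise'; 'NaT' nulls out ambiguous wall-clock times.
import pandas as pd

# 01:30 occurs twice on 2014-11-02 in US/Eastern (end of DST)
idx = pd.DatetimeIndex(['2014-11-02 01:30:00', '2014-11-02 01:30:00'])
s = pd.Series([1, 2], index=idx)
print(s.tz_localize('US/Eastern', ambiguous='NaT').index)   # both become NaT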
raise NotImplementedError(msg) @@ -3686,16 +3756,6 @@ def describe(self, percentile_width=None, percentiles=None): uh = percentiles[percentiles > .5] percentiles = np.hstack([lh, 0.5, uh]) - # dtypes: numeric only, numeric mixed, objects only - data = self._get_numeric_data() - if self.ndim > 1: - if len(data._info_axis) == 0: - is_object = True - else: - is_object = False - else: - is_object = not self._is_numeric_mixed_type - def pretty_name(x): x *= 100 if x == int(x): @@ -3704,10 +3764,12 @@ def pretty_name(x): return '%.1f%%' % x def describe_numeric_1d(series, percentiles): - return ([series.count(), series.mean(), series.std(), - series.min()] + - [series.quantile(x) for x in percentiles] + - [series.max()]) + stat_index = (['count', 'mean', 'std', 'min'] + + [pretty_name(x) for x in percentiles] + ['max']) + d = ([series.count(), series.mean(), series.std(), series.min()] + + [series.quantile(x) for x in percentiles] + [series.max()]) + return pd.Series(d, index=stat_index, name=series.name) + def describe_categorical_1d(data): names = ['count', 'unique'] @@ -3720,44 +3782,49 @@ def describe_categorical_1d(data): names += ['top', 'freq'] result += [top, freq] - elif issubclass(data.dtype.type, np.datetime64): + elif com.is_datetime64_dtype(data): asint = data.dropna().values.view('i8') - names += ['first', 'last', 'top', 'freq'] - result += [lib.Timestamp(asint.min()), - lib.Timestamp(asint.max()), - lib.Timestamp(top), freq] - - return pd.Series(result, index=names) - - if is_object: - if data.ndim == 1: - return describe_categorical_1d(self) + names += ['top', 'freq', 'first', 'last'] + result += [lib.Timestamp(top), freq, + lib.Timestamp(asint.min()), + lib.Timestamp(asint.max())] + + return pd.Series(result, index=names, name=data.name) + + def describe_1d(data, percentiles): + if com.is_numeric_dtype(data): + return describe_numeric_1d(data, percentiles) + elif com.is_timedelta64_dtype(data): + return describe_numeric_1d(data, percentiles) else: - result = pd.DataFrame(dict((k, describe_categorical_1d(v)) - for k, v in compat.iteritems(self)), - columns=self._info_axis, - index=['count', 'unique', 'first', 'last', - 'top', 'freq']) - # just objects, no datime - if pd.isnull(result.loc['first']).all(): - result = result.drop(['first', 'last'], axis=0) - return result - else: - stat_index = (['count', 'mean', 'std', 'min'] + - [pretty_name(x) for x in percentiles] + - ['max']) - if data.ndim == 1: - return pd.Series(describe_numeric_1d(data, percentiles), - index=stat_index) + return describe_categorical_1d(data) + + if self.ndim == 1: + return describe_1d(self, percentiles) + elif (include is None) and (exclude is None): + if len(self._get_numeric_data()._info_axis) > 0: + # when some numerics are found, keep only numerics + data = self.select_dtypes(include=[np.number, np.bool]) else: - destat = [] - for i in range(len(data._info_axis)): # BAD - series = data.iloc[:, i] - destat.append(describe_numeric_1d(series, percentiles)) - - return self._constructor(lmap(list, zip(*destat)), - index=stat_index, - columns=data._info_axis) + data = self + elif include == 'all': + if exclude != None: + msg = "exclude must be None when include is 'all'" + raise ValueError(msg) + data = self + else: + data = self.select_dtypes(include=include, exclude=exclude) + + ldesc = [describe_1d(s, percentiles) for _, s in data.iteritems()] + # set a convenient order for rows + names = [] + ldesc_indexes = sorted([x.index for x in ldesc], key=len) + for idxnames in ldesc_indexes: + for name in idxnames: 
+ if name not in names: + names.append(name) + d = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1) + return d _shared_docs['pct_change'] = """ Percent change over given number of periods. @@ -3925,60 +3992,42 @@ def mad(self, axis=None, skipna=None, level=None, **kwargs): return np.abs(demeaned).mean(axis=axis, skipna=skipna) cls.mad = mad - @Substitution(outname='variance', - desc="Return unbiased variance over requested " - "axis.\n\nNormalized by N-1 by default. " - "This can be changed using the ddof argument") - @Appender(_num_doc) - def var(self, axis=None, skipna=None, level=None, ddof=1, **kwargs): - if skipna is None: - skipna = True - if axis is None: - axis = self._stat_axis_number - if level is not None: - return self._agg_by_level('var', axis=axis, level=level, - skipna=skipna, ddof=ddof) - - return self._reduce(nanops.nanvar, axis=axis, skipna=skipna, - ddof=ddof) - cls.var = var + def _make_stat_function_ddof(name, desc, f): - @Substitution(outname='stdev', - desc="Return unbiased standard deviation over requested " - "axis.\n\nNormalized by N-1 by default. " - "This can be changed using the ddof argument") - @Appender(_num_doc) - def std(self, axis=None, skipna=None, level=None, ddof=1, **kwargs): - if skipna is None: - skipna = True - if axis is None: - axis = self._stat_axis_number - if level is not None: - return self._agg_by_level('std', axis=axis, level=level, - skipna=skipna, ddof=ddof) - result = self.var(axis=axis, skipna=skipna, ddof=ddof) - if getattr(result, 'ndim', 0) > 0: - return result.apply(np.sqrt) - return np.sqrt(result) - cls.std = std - - @Substitution(outname='standarderror', - desc="Return unbiased standard error of the mean over " - "requested axis.\n\nNormalized by N-1 by default. " - "This can be changed using the ddof argument") - @Appender(_num_doc) - def sem(self, axis=None, skipna=None, level=None, ddof=1, **kwargs): - if skipna is None: - skipna = True - if axis is None: - axis = self._stat_axis_number - if level is not None: - return self._agg_by_level('sem', axis=axis, level=level, - skipna=skipna, ddof=ddof) + @Substitution(outname=name, desc=desc) + @Appender(_num_doc) + def stat_func(self, axis=None, skipna=None, level=None, ddof=1, + **kwargs): + if skipna is None: + skipna = True + if axis is None: + axis = self._stat_axis_number + if level is not None: + return self._agg_by_level(name, axis=axis, level=level, + skipna=skipna, ddof=ddof) + return self._reduce(f, axis=axis, + skipna=skipna, ddof=ddof) + stat_func.__name__ = name + return stat_func - return self._reduce(nanops.nansem, axis=axis, skipna=skipna, - ddof=ddof) - cls.sem = sem + cls.sem = _make_stat_function_ddof( + 'sem', + "Return unbiased standard error of the mean over " + "requested axis.\n\nNormalized by N-1 by default. " + "This can be changed using the ddof argument", + nanops.nansem) + cls.var = _make_stat_function_ddof( + 'var', + "Return unbiased variance over requested " + "axis.\n\nNormalized by N-1 by default. " + "This can be changed using the ddof argument", + nanops.nanvar) + cls.std = _make_stat_function_ddof( + 'std', + "Return unbiased standard deviation over requested " + "axis.\n\nNormalized by N-1 by default. 
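# Hedged example of the reworked describe() above: include/exclude select
# which dtypes are summarised, and mixed outputs are NaN-padded.
import pandas as pd

df = pd.DataFrame({'num': [1, 2, 3], 'cat': ['a', 'a', 'b']})
print(df.describe())                 # numeric column only (default)
print(df.describe(include=['O']))    # object column: count/unique/top/freq
print(df.describe(include='all'))    # union of both row sets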
" + "This can be changed using the ddof argument", + nanops.nanstd) @Substitution(outname='compounded', desc="Return the compound percentage of the values for " diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index ce57a9c03d570..a141d8cebfd8e 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -18,7 +18,7 @@ from pandas.core.internals import BlockManager, make_block from pandas.core.series import Series from pandas.core.panel import Panel -from pandas.util.decorators import cache_readonly, Appender +from pandas.util.decorators import cache_readonly, Appender, make_signature import pandas.core.algorithms as algos import pandas.core.common as com from pandas.core.common import(_possibly_downcast_to_dtype, isnull, @@ -147,10 +147,7 @@ def _last(x): def _count_compat(x, axis=0): - try: - return x.size - except: - return x.count() + return x.count() # .size != .count(); count excludes nan class Grouper(object): """ @@ -251,7 +248,7 @@ def _set_grouper(self, obj, sort=False): key = self.key if key not in obj._info_axis: raise KeyError("The grouper name {0} is not found".format(key)) - ax = Index(obj[key],name=key) + ax = Index(obj[key], name=key) else: ax = obj._get_axis(self.axis) @@ -261,18 +258,12 @@ def _set_grouper(self, obj, sort=False): # if a level is given it must be a mi level or # equivalent to the axis name if isinstance(ax, MultiIndex): - - if isinstance(level, compat.string_types): - if obj.index.name != level: - raise ValueError('level name %s is not the name of the ' - 'index' % level) - elif level > 0: - raise ValueError('level > 0 only valid with MultiIndex') - ax = Index(ax.get_level_values(level), name=level) + level = ax._get_level_number(level) + ax = Index(ax.get_level_values(level), name=ax.names[level]) else: - if not (level == 0 or level == ax.name): - raise ValueError("The grouper level {0} is not valid".format(level)) + if level not in (0, ax.name): + raise ValueError("The level {0} is not valid".format(level)) # possibly sort if (self.sort or sort) and not ax.is_monotonic: @@ -431,9 +422,18 @@ def convert(key, s): sample = next(iter(self.indices)) if isinstance(sample, tuple): if not isinstance(name, tuple): - raise ValueError("must supply a tuple to get_group with multiple grouping keys") + msg = ("must supply a tuple to get_group with multiple" + " grouping keys") + raise ValueError(msg) if not len(name) == len(sample): - raise ValueError("must supply a a same-length tuple to get_group with multiple grouping keys") + try: + # If the original grouper was a tuple + return self.indices[name] + except KeyError: + # turns out it wasn't a tuple + msg = ("must supply a a same-length tuple to get_group" + " with multiple grouping keys") + raise ValueError(msg) name = tuple([ convert(n, k) for n, k in zip(name,sample) ]) @@ -473,7 +473,24 @@ def _set_selection_from_grouper(self): ax = self.obj._info_axis groupers = [ g.name for g in grp.groupings if g.level is None and g.name is not None and g.name in ax ] if len(groupers): - self._group_selection = (ax-Index(groupers)).tolist() + self._group_selection = ax.difference(Index(groupers)).tolist() + + def _set_result_index_ordered(self, result): + # set the result index on the passed values object + # return the new object + # related 8046 + + # the values/counts are repeated according to the group index + indices = self.indices + + # shortcut of we have an already ordered grouper + if not self.grouper.is_monotonic: + index = Index(np.concatenate([ indices[v] for v in self.grouper.result_index ])) 
+ result.index = index + result = result.sort_index() + + result.index = self.obj.index + return result def _local_dir(self): return sorted(set(self.obj._local_dir() + list(self._apply_whitelist))) @@ -515,7 +532,7 @@ def wrapper(*args, **kwargs): # a little trickery for aggregation functions that need an axis # argument kwargs_with_axis = kwargs.copy() - if 'axis' not in kwargs_with_axis: + if 'axis' not in kwargs_with_axis or kwargs_with_axis['axis']==None: kwargs_with_axis['axis'] = self.axis def curried_with_axis(x): @@ -755,12 +772,21 @@ def ohlc(self): def nth(self, n, dropna=None): """ - Take the nth row from each group. + Take the nth row from each group if n is an int, or a subset of rows + if n is a list of ints. - If dropna, will not show nth non-null row, dropna is either + If dropna, will take the nth non-null row, dropna is either Truthy (if a Series) or 'all', 'any' (if a DataFrame); this is equivalent to calling dropna(how=dropna) before the groupby. + Parameters + ---------- + n : int or list of ints + a single nth value for the row or a list of nth values + dropna : None or str, optional + apply the specified dropna operation before counting which row is + the nth row. Needs to be None, 'any' or 'all' + Examples -------- >>> df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) @@ -788,19 +814,36 @@ def nth(self, n, dropna=None): 5 NaN """ + if isinstance(n, int): + nth_values = [n] + elif isinstance(n, (set, list, tuple)): + nth_values = list(set(n)) + if dropna is not None: + raise ValueError("dropna option with a list of nth values is not supported") + else: + raise TypeError("n needs to be an int or a list/set/tuple of ints") + + m = self.grouper._max_groupsize + # filter out values that are outside [-m, m) + pos_nth_values = [i for i in nth_values if i >= 0 and i < m] + neg_nth_values = [i for i in nth_values if i < 0 and i >= -m] self._set_selection_from_grouper() if not dropna: # good choice - m = self.grouper._max_groupsize - if n >= m or n < -m: + if not pos_nth_values and not neg_nth_values: + # no valid nth values return self._selected_obj.loc[[]] + rng = np.zeros(m, dtype=bool) - if n >= 0: - rng[n] = True - is_nth = self._cumcount_array(rng) - else: - rng[- n - 1] = True - is_nth = self._cumcount_array(rng, ascending=False) + for i in pos_nth_values: + rng[i] = True + is_nth = self._cumcount_array(rng) + + if neg_nth_values: + rng = np.zeros(m, dtype=bool) + for i in neg_nth_values: + rng[- i - 1] = True + is_nth |= self._cumcount_array(rng, ascending=False) result = self._selected_obj[is_nth] @@ -875,8 +918,8 @@ def cumcount(self, **kwargs): ascending : bool, default True If False, number in reverse, from length of group - 1 to 0. - Example - ------- + Examples + -------- >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']], ... columns=['A']) @@ -920,8 +963,8 @@ def head(self, n=5): Essentially equivalent to ``.apply(lambda x: x.head(n))``, except ignores as_index flag. - Example - ------- + Examples + -------- >>> df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) @@ -947,8 +990,8 @@ def tail(self, n=5): Essentially equivalent to ``.apply(lambda x: x.tail(n))``, except ignores as_index flag. 
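# Hedged example of nth() accepting a list of positions (dropna must be
# left as None in that case, per the check above).
import pandas as pd

df = pd.DataFrame({'A': [1, 1, 5, 5], 'B': [10, 20, 30, 40]})
g = df.groupby('A')
print(g.nth(0))          # first row of each group
print(g.nth([0, -1]))    # first and last rows of each group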
- Example - ------- + Examples + -------- >>> df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) @@ -1304,6 +1347,11 @@ def groups(self): to_groupby = Index(to_groupby) return self.axis.groupby(to_groupby.values) + @cache_readonly + def is_monotonic(self): + # return if my group orderings are monotonic + return Index(self.group_info[0]).is_monotonic + @cache_readonly def group_info(self): comp_ids, obs_group_ids = self._get_compressed_labels() @@ -1474,14 +1522,15 @@ def aggregate(self, values, how, axis=0): result = self._aggregate(result, counts, values, how, is_numeric) - if self._filter_empty_groups: + if self._filter_empty_groups and not counts.all(): if result.ndim == 2: try: result = lib.row_bool_subset( result, (counts > 0).view(np.uint8)) except ValueError: result = lib.row_bool_subset_object( - result, (counts > 0).view(np.uint8)) + com._ensure_object(result), + (counts > 0).view(np.uint8)) else: result = result[counts > 0] @@ -1532,7 +1581,7 @@ def _aggregate_series_fast(self, obj, func): # avoids object / Series creation overhead dummy = obj._get_values(slice(None, 0)).to_dense() - indexer = _algos.groupsort_indexer(group_index, ngroups)[0] + indexer = _get_group_index_sorter(group_index, ngroups) obj = obj.take(indexer, convert=False) group_index = com.take_nd(group_index, indexer, allow_fill=False) grouper = lib.SeriesGrouper(obj, func, group_index, ngroups, @@ -1694,6 +1743,11 @@ def indices(self): i = bin return indices + @cache_readonly + def group_info(self): + # for compat + return self.bins, self.binlabels, self.ngroups + @cache_readonly def ngroups(self): return len(self.binlabels) @@ -1879,7 +1933,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, self.grouper = np.asarray(factor) self._labels = factor.codes - self._group_index = factor.levels + self._group_index = factor.categories if self.name is None: self.name = factor.name @@ -1895,6 +1949,9 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, # no level passed if not isinstance(self.grouper, (Series, Index, np.ndarray)): + if getattr(self.grouper,'ndim', 1) != 1: + t = self.name or str(type(self.grouper)) + raise ValueError("Grouper for '%s' not 1-dimensional" % t) self.grouper = self.index.map(self.grouper) if not (hasattr(self.grouper, "__len__") and len(self.grouper) == len(self.index)): @@ -2053,8 +2110,7 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True): gpr = obj[gpr] if isinstance(gpr, Categorical) and len(gpr) != len(obj): - errmsg = "Categorical grouper must have len(grouper) == len(data)" - raise AssertionError(errmsg) + raise ValueError("Categorical grouper must have len(grouper) == len(data)") ping = Grouping(group_axis, gpr, obj=obj, name=name, level=level, sort=sort) groupings.append(ping) @@ -2087,9 +2143,72 @@ def _convert_grouper(axis, grouper): else: return grouper +def _whitelist_method_generator(klass, whitelist) : + """ + Yields all GroupBy member defs for DataFrame/Series names in _whitelist. + + Parameters + ---------- + klass - class where members are defined. Should be Series or DataFrame + + whitelist - list of names of klass methods to be constructed + + Returns + ------- + The generator yields a sequence of strings, each suitable for exec'ing, + that define implementations of the named methods for DataFrameGroupBy + or SeriesGroupBy. + + Since we don't want to override methods explicitly defined in the + base class, any such name is skipped. 
+ """ + + method_wrapper_template = \ + """def %(name)s(%(sig)s) : + \""" + %(doc)s + \""" + f = %(self)s.__getattr__('%(name)s') + return f(%(args)s)""" + property_wrapper_template = \ + """@property +def %(name)s(self) : + \""" + %(doc)s + \""" + return self.__getattr__('%(name)s')""" + for name in whitelist : + # don't override anything that was explicitly defined + # in the base class + if hasattr(GroupBy,name) : + continue + # ugly, but we need the name string itself in the method. + f = getattr(klass,name) + doc = f.__doc__ + doc = doc if type(doc)==str else '' + if type(f) == types.MethodType : + wrapper_template = method_wrapper_template + decl, args = make_signature(f) + # pass args by name to f because otherwise + # GroupBy._make_wrapper won't know whether + # we passed in an axis parameter. + args_by_name = ['{0}={0}'.format(arg) for arg in args[1:]] + params = {'name':name, + 'doc':doc, + 'sig':','.join(decl), + 'self':args[0], + 'args':','.join(args_by_name)} + else : + wrapper_template = property_wrapper_template + params = {'name':name, 'doc':doc} + yield wrapper_template % params class SeriesGroupBy(GroupBy): + # + # Make class defs of attributes on SeriesGroupBy whitelist _apply_whitelist = _series_apply_whitelist + for _def_str in _whitelist_method_generator(Series,_series_apply_whitelist) : + exec(_def_str) def aggregate(self, func_or_funcs, *args, **kwargs): """ @@ -2319,18 +2438,7 @@ def _transform_fast(self, func): counts = self.count().values values = np.repeat(values, com._ensure_platform_int(counts)) - # the values/counts are repeated according to the group index - indices = self.indices - - # shortcut of we have an already ordered grouper - if Index(self.grouper.group_info[0]).is_monotonic: - result = Series(values, index=self.obj.index) - else: - index = Index(np.concatenate([ indices[v] for v in self.grouper.result_index ])) - result = Series(values, index=index).sort_index() - result.index = self.obj.index - - return result + return self._set_result_index_ordered(Series(values)) def filter(self, func, dropna=True, *args, **kwargs): """ @@ -2344,8 +2452,8 @@ def filter(self, func, dropna=True, *args, **kwargs): dropna : Drop groups that do not pass the filter. True by default; if False, groups that evaluate False are filled with NaNs. 
- Example - ------- + Examples + -------- >>> grouped.filter(lambda x: x.mean() > 0) Returns @@ -2436,7 +2544,7 @@ def _cython_agg_blocks(self, how, numeric_only=True): values = block._try_operate(block.values) if block.is_numeric: - values = com.ensure_float(values) + values = _algos.ensure_float64(values) result, _ = self.grouper.aggregate(values, how, axis=agg_axis) @@ -2743,8 +2851,9 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): return concat(values) if not all_indexed_same: + # GH 8467 return self._concat_objects( - keys, values, not_indexed_same=not_indexed_same + keys, values, not_indexed_same=True, ) try: @@ -2842,8 +2951,7 @@ def _transform_general(self, func, *args, **kwargs): concat_index = obj.columns if self.axis == 0 else obj.index concatenated = concat(applied, join_axes=[concat_index], axis=self.axis, verify_integrity=False) - concatenated.sort_index(inplace=True) - return concatenated + return self._set_result_index_ordered(concatenated) def transform(self, func, *args, **kwargs): """ @@ -2895,8 +3003,9 @@ def transform(self, func, *args, **kwargs): type(result.index) != type(obj.index)) or len(result.index) != len(obj.index)): results = obj.values.copy() + indices = self.indices for (name, group), (i, row) in zip(self, result.iterrows()): - indexer = self._get_index(name) + indexer = indices[name] results[indexer] = np.tile(row.values,len(indexer)).reshape(len(indexer),-1) return DataFrame(results,columns=result.columns,index=obj.index).convert_objects() @@ -2975,7 +3084,7 @@ def filter(self, func, dropna=True, *args, **kwargs): Each subframe is endowed the attribute 'name' in case you need to know which group you are working on. - Example + Examples -------- >>> grouped = df.groupby(lambda x: mapping[x]) >>> grouped.filter(lambda x: x['A'].sum() + x['B'].sum() > 0) @@ -3012,6 +3121,10 @@ def filter(self, func, dropna=True, *args, **kwargs): class DataFrameGroupBy(NDFrameGroupBy): _apply_whitelist = _dataframe_apply_whitelist + # + # Make class defs of attributes on DataFrameGroupBy whitelist. 
+ for _def_str in _whitelist_method_generator(DataFrame,_apply_whitelist) : + exec(_def_str) _block_agg_axis = 1 @@ -3130,7 +3243,8 @@ def _reindex_output(self, result): levels_list = [ ping._group_index for ping in groupings ] index = MultiIndex.from_product(levels_list, names=self.grouper.names) - return result.reindex(**{ self.obj._get_axis_name(self.axis) : index, 'copy' : False }).sortlevel() + d = { self.obj._get_axis_name(self.axis) : index, 'copy' : False } + return result.reindex(**d).sortlevel(axis=self.axis) def _iterate_column_groupbys(self): for i, colname in enumerate(self._selected_obj.columns): @@ -3257,7 +3371,7 @@ def slabels(self): @cache_readonly def sort_idx(self): # Counting sort indexer - return _algos.groupsort_indexer(self.labels, self.ngroups)[0] + return _get_group_index_sorter(self.labels, self.ngroups) def __iter__(self): sdata = self._get_sorted_data() @@ -3417,13 +3531,7 @@ def _indexer_from_factorized(labels, shape, compress=True): comp_ids = group_index max_group = com._long_prod(shape) - if max_group > 1e6: - # Use mergesort to avoid memory errors in counting sort - indexer = comp_ids.argsort(kind='mergesort') - else: - indexer, _ = _algos.groupsort_indexer(comp_ids.astype(np.int64), - max_group) - + indexer = _get_group_index_sorter(comp_ids.astype(np.int64), max_group) return indexer @@ -3448,7 +3556,7 @@ def _lexsort_indexer(keys, orders=None, na_position='last'): if na_position not in ['last','first']: raise ValueError('invalid na_position: {!r}'.format(na_position)) - n = len(c.levels) + n = len(c.categories) codes = c.codes.copy() mask = (c.codes == -1) @@ -3529,16 +3637,14 @@ def get_key(self, comp_id): def _get_indices_dict(label_list, keys): - shape = [len(x) for x in keys] - group_index = get_group_index(label_list, shape) + shape = list(map(len, keys)) + ngroups = np.prod(shape) - sorter, _ = _algos.groupsort_indexer(com._ensure_int64(group_index), - np.prod(shape)) - - sorter_int = com._ensure_platform_int(sorter) + group_index = get_group_index(label_list, shape) + sorter = _get_group_index_sorter(group_index, ngroups) - sorted_labels = [lab.take(sorter_int) for lab in label_list] - group_index = group_index.take(sorter_int) + sorted_labels = [lab.take(sorter) for lab in label_list] + group_index = group_index.take(sorter) return lib.indices_fast(sorter, group_index, keys, sorted_labels) @@ -3546,6 +3652,30 @@ def _get_indices_dict(label_list, keys): #---------------------------------------------------------------------- # sorting levels...cleverly? +def _get_group_index_sorter(group_index, ngroups): + """ + _algos.groupsort_indexer implements `counting sort` and it is at least + O(ngroups), where + ngroups = prod(shape) + shape = map(len, keys) + that is, linear in the number of combinations (cartesian product) of unique + values of groupby keys. This can be huge when doing multi-key groupby. + np.argsort(kind='mergesort') is O(count x log(count)) where count is the + length of the data-frame; + Both algorithms are `stable` sort and that is necessary for correctness of + groupby operations. e.g. 
consider: + df.groupby(key)[col].transform('first') + """ + count = len(group_index) + alpha = 0.0 # taking complexities literally; there may be + beta = 1.0 # some room for fine-tuning these parameters + if alpha + beta * ngroups < count * np.log(count): + sorter, _ = _algos.groupsort_indexer(com._ensure_int64(group_index), + ngroups) + return com._ensure_platform_int(sorter) + else: + return group_index.argsort(kind='mergesort') + def _compress_group_index(group_index, sort=True): """ @@ -3588,12 +3718,16 @@ def _reorder_by_uniques(uniques, labels): _func_table = { - builtins.sum: np.sum + builtins.sum: np.sum, + builtins.max: np.max, + builtins.min: np.min } _cython_table = { builtins.sum: 'sum', + builtins.max: 'max', + builtins.min: 'min', np.sum: 'sum', np.mean: 'mean', np.prod: 'prod', diff --git a/pandas/core/index.py b/pandas/core/index.py index 4bfeb86cd84c0..c2c7e28a7a7f4 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -11,7 +11,7 @@ import pandas.lib as lib import pandas.algos as _algos import pandas.index as _index -from pandas.lib import Timestamp, is_datetime_array +from pandas.lib import Timestamp, Timedelta, is_datetime_array from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, _shared_docs from pandas.util.decorators import Appender, cache_readonly, deprecate from pandas.core.common import isnull, array_equivalent @@ -136,23 +136,28 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, else: return result elif issubclass(data.dtype.type, np.timedelta64): - return Int64Index(data, copy=copy, name=name) + from pandas.tseries.tdi import TimedeltaIndex + result = TimedeltaIndex(data, copy=copy, name=name, **kwargs) + if dtype is not None and _o_dtype == dtype: + return Index(result.to_pytimedelta(), dtype=_o_dtype) + else: + return result if dtype is not None: try: data = np.array(data, dtype=dtype, copy=copy) except TypeError: pass - elif isinstance(data, PeriodIndex): - return PeriodIndex(data, copy=copy, name=name, **kwargs) + # maybe coerce to a sub-class + if isinstance(data, PeriodIndex): + return PeriodIndex(data, copy=copy, name=name, **kwargs) if issubclass(data.dtype.type, np.integer): return Int64Index(data, copy=copy, dtype=dtype, name=name) - if issubclass(data.dtype.type, np.floating): + elif issubclass(data.dtype.type, np.floating): return Float64Index(data, copy=copy, dtype=dtype, name=name) - - if com.is_bool_dtype(data): - subarr = data + elif issubclass(data.dtype.type, np.bool) or com.is_bool_dtype(data): + subarr = data.astype('object') else: subarr = com._asarray_tuplesafe(data, dtype=object) @@ -196,6 +201,10 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, tslib.is_timestamp_array(subarr)): from pandas.tseries.index import DatetimeIndex return DatetimeIndex(subarr, copy=copy, name=name, **kwargs) + elif (inferred.startswith('timedelta') or + lib.is_timedelta_array(subarr)): + from pandas.tseries.tdi import TimedeltaIndex + return TimedeltaIndex(subarr, copy=copy, name=name, **kwargs) elif inferred == 'period': return PeriodIndex(subarr, name=name, **kwargs) @@ -211,7 +220,7 @@ def _simple_new(cls, values, name=None, **kwargs): result._reset_identity() return result - def _update_inplace(self, result): + def _update_inplace(self, result, **kwargs): # guard when called from IndexOpsMixin raise TypeError("Index can't be updated inplace") @@ -398,27 +407,25 @@ def __unicode__(self): quote_strings=True) return "%s(%s, dtype='%s')" % 
(type(self).__name__, prepr, self.dtype) - def to_series(self, keep_tz=False): + def to_series(self, **kwargs): """ Create a Series with both index and values equal to the index keys useful with map for returning an indexer based on an index - Parameters - ---------- - keep_tz : optional, defaults False. - applies only to a DatetimeIndex - Returns ------- Series : dtype will be based on the type of the Index values. """ - import pandas as pd - values = self._to_embed(keep_tz) - return pd.Series(values, index=self, name=self.name) + from pandas import Series + return Series(self._to_embed(), index=self, name=self.name) def _to_embed(self, keep_tz=False): - """ return an array repr of this object, potentially casting to object """ + """ + return an array repr of this object, potentially casting to object + + This is for internal compat + """ return self.values def astype(self, dtype): @@ -576,6 +583,9 @@ def is_unique(self): """ return if the index has unique values """ return self._engine.is_unique + def is_boolean(self): + return self.inferred_type in ['boolean'] + def is_integer(self): return self.inferred_type in ['integer'] @@ -585,6 +595,9 @@ def is_floating(self): def is_numeric(self): return self.inferred_type in ['integer', 'floating'] + def is_object(self): + return self.dtype == np.object_ + def is_mixed(self): return 'mixed' in self.inferred_type @@ -836,6 +849,7 @@ def __setstate__(self, state): np.ndarray.__setstate__(data, state) self._data = data + self._reset_identity() else: raise Exception("invalid pickle state") _unpickle_compat = __setstate__ @@ -931,8 +945,8 @@ def append(self, other): @staticmethod def _ensure_compat_concat(indexes): - from pandas.tseries.api import DatetimeIndex, PeriodIndex - klasses = DatetimeIndex, PeriodIndex + from pandas.tseries.api import DatetimeIndex, PeriodIndex, TimedeltaIndex + klasses = DatetimeIndex, PeriodIndex, TimedeltaIndex is_ts = [isinstance(idx, klasses) for idx in indexes] @@ -1033,7 +1047,7 @@ def equals(self, other): if type(other) != Index: return other.equals(self) - return array_equivalent(self, other) + return array_equivalent(_values_from_object(self), _values_from_object(other)) def identical(self, other): """Similar to equals, but check that other comparable attributes are @@ -1052,6 +1066,9 @@ def asof(self, label): if isinstance(label, (Index, ABCSeries, np.ndarray)): raise TypeError('%s' % type(label)) + if not isinstance(label, Timestamp): + label = Timestamp(label) + if label not in self: loc = self.searchsorted(label, side='left') if loc > 0: @@ -1059,8 +1076,6 @@ def asof(self, label): else: return np.nan - if not isinstance(label, Timestamp): - label = Timestamp(label) return label def asof_locs(self, where, mask): @@ -1128,9 +1143,10 @@ def argsort(self, *args, **kwargs): def __add__(self, other): if isinstance(other, Index): + warnings.warn("using '+' to provide set union with Indexes is deprecated, " + "use '|' or .union()",FutureWarning) return self.union(other) - else: - return Index(np.array(self) + other) + return Index(np.array(self) + other) __iadd__ = __add__ __eq__ = _indexOp('__eq__') @@ -1141,7 +1157,10 @@ def __add__(self, other): __ge__ = _indexOp('__ge__') def __sub__(self, other): - return self.diff(other) + if isinstance(other, Index): + warnings.warn("using '-' to provide set differences with Indexes is deprecated, " + "use .difference()",FutureWarning) + return self.difference(other) def __and__(self, other): return self.intersection(other) @@ -1267,13 +1286,14 @@ def intersection(self, 
other): except: # duplicates indexer = self.get_indexer_non_unique(other.values)[0].unique() + indexer = indexer[indexer != -1] taken = self.take(indexer) if self.name != other.name: taken.name = None return taken - def diff(self, other): + def difference(self, other): """ Compute sorted set difference of two Index objects @@ -1289,8 +1309,7 @@ def diff(self, other): ----- One can do either of these and achieve the same result - >>> index - index2 - >>> index.diff(index2) + >>> index.difference(index2) """ if not hasattr(other, '__iter__'): @@ -1308,6 +1327,8 @@ def diff(self, other): theDiff = sorted(set(self) - set(other)) return Index(theDiff, name=result_name) + diff = deprecate('diff',difference) + def sym_diff(self, other, result_name=None): """ Compute the sorted symmetric difference of two Index objects. @@ -1350,7 +1371,7 @@ def sym_diff(self, other, result_name=None): other = Index(other) result_name = result_name or self.name - the_diff = sorted(set((self - other) + (other - self))) + the_diff = sorted(set((self.difference(other)).union(other.difference(self)))) return Index(the_diff, name=result_name) def get_loc(self, key): @@ -1563,34 +1584,38 @@ def _get_method(self, method): } return aliases.get(method, method) - def reindex(self, target, method=None, level=None, limit=None, - copy_if_needed=False): + def reindex(self, target, method=None, level=None, limit=None): """ - For Index, simply returns the new index and the results of - get_indexer. Provided here to enable an interface that is amenable for - subclasses of Index whose internals are different (like MultiIndex) + Create index with target's values (move/add/delete values as necessary) Returns ------- - (new_index, indexer, mask) : tuple - """ - target = _ensure_index(target) + new_index : pd.Index + Resulting index + indexer : np.ndarray or None + Indices of output values in original index + + """ + # GH6552: preserve names when reindexing to non-named target + # (i.e. neither Index nor Series). + preserve_names = not hasattr(target, 'name') + + # GH7774: preserve dtype/tz if target is empty and not an Index. 
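# Hedged usage sketch of the reindex behaviour implemented in this hunk
# (GH6552 name preservation, GH7774 dtype preservation for empty targets);
# the commented outputs are what the new code is expected to produce.
import pandas as pd

idx = pd.Index([1, 2, 3], name='x')

# Target is a plain list with no name of its own: the result keeps idx's name.
new_index, indexer = idx.reindex([2, 3, 4])
print(new_index.name)   # expected: 'x'
print(indexer)          # expected positions: 1, 2, -1 (4 is not in idx)

# Target is empty and not an Index: the original dtype is kept, not object.
empty, _ = idx.reindex([])
print(empty.dtype)      # expected: int64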
+ target = _ensure_has_len(target) # target may be an iterator + if not isinstance(target, Index) and len(target) == 0: + attrs = self._get_attributes_dict() + attrs.pop('freq', None) # don't preserve freq + target = self._simple_new(np.empty(0, dtype=self.dtype), **attrs) + else: + target = _ensure_index(target) if level is not None: if method is not None: raise TypeError('Fill method not supported if level passed') _, indexer, _ = self._join_level(target, level, how='right', return_indexers=True) else: - if self.equals(target): indexer = None - - # to avoid aliasing an existing index - if (copy_if_needed and target.name != self.name and - self.name is not None): - if target.name is None: - target = self.copy() - else: if self.is_unique: indexer = self.get_indexer(target, method=method, @@ -1601,6 +1626,10 @@ def reindex(self, target, method=None, level=None, limit=None, "with a method or limit") indexer, missing = self.get_indexer_non_unique(target) + if preserve_names and target.nlevels == 1 and target.name != self.name: + target = target.copy() + target.name = self.name + return target, indexer def join(self, other, how='left', level=None, return_indexers=False): @@ -2037,22 +2066,33 @@ def drop_duplicates(self, take_last=False): def duplicated(self, take_last=False): return super(Index, self).duplicated(take_last=take_last) + + def _evaluate_with_timedelta_like(self, other, op, opstr): + raise TypeError("can only perform ops with timedelta like values") + + def _evaluate_with_datetime_like(self, other, op, opstr): + raise TypeError("can only perform ops with datetime like values") + @classmethod def _add_numeric_methods_disabled(cls): """ add in numeric methods to disable """ def _make_invalid_op(opstr): - def _invalid_op(self, other): + def _invalid_op(self, other=None): raise TypeError("cannot perform {opstr} with this index type: {typ}".format(opstr=opstr, - typ=type(self))) + typ=type(self))) return _invalid_op - cls.__mul__ = cls.__rmul__ = _make_invalid_op('multiplication') - cls.__floordiv__ = cls.__rfloordiv__ = _make_invalid_op('floor division') - cls.__truediv__ = cls.__rtruediv__ = _make_invalid_op('true division') + cls.__mul__ = cls.__rmul__ = _make_invalid_op('__mul__') + cls.__floordiv__ = cls.__rfloordiv__ = _make_invalid_op('__floordiv__') + cls.__truediv__ = cls.__rtruediv__ = _make_invalid_op('__truediv__') if not compat.PY3: - cls.__div__ = cls.__rdiv__ = _make_invalid_op('division') + cls.__div__ = cls.__rdiv__ = _make_invalid_op('__div__') + cls.__neg__ = _make_invalid_op('__neg__') + cls.__pos__ = _make_invalid_op('__pos__') + cls.__abs__ = _make_invalid_op('__abs__') + cls.__inv__ = _make_invalid_op('__inv__') @classmethod def _add_numeric_methods(cls): @@ -2061,6 +2101,7 @@ def _add_numeric_methods(cls): def _make_evaluate_binop(op, opstr): def _evaluate_numeric_binop(self, other): + import pandas.tseries.offsets as offsets # if we are an inheritor of numeric, but not actually numeric (e.g. 
DatetimeIndex/PeriodInde) if not self._is_numeric_dtype: @@ -2080,6 +2121,10 @@ def _evaluate_numeric_binop(self, other): other = _values_from_object(other) if other.dtype.kind not in ['f','i']: raise TypeError("cannot evaluate a numeric op with a non-numeric dtype") + elif isinstance(other, (offsets.DateOffset, np.timedelta64, Timedelta, datetime.timedelta)): + return self._evaluate_with_timedelta_like(other, op, opstr) + elif isinstance(other, (Timestamp, np.datetime64)): + return self._evaluate_with_datetime_like(other, op, opstr) else: if not (com.is_float(other) or com.is_integer(other)): raise TypeError("can only perform ops with scalar values") @@ -2087,12 +2132,29 @@ def _evaluate_numeric_binop(self, other): return _evaluate_numeric_binop + def _make_evaluate_unary(op, opstr): - cls.__mul__ = cls.__rmul__ = _make_evaluate_binop(operator.mul,'multiplication') - cls.__floordiv__ = cls.__rfloordiv__ = _make_evaluate_binop(operator.floordiv,'floor division') - cls.__truediv__ = cls.__rtruediv__ = _make_evaluate_binop(operator.truediv,'true division') + def _evaluate_numeric_unary(self): + + # if we are an inheritor of numeric, but not actually numeric (e.g. DatetimeIndex/PeriodInde) + if not self._is_numeric_dtype: + raise TypeError("cannot evaluate a numeric op {opstr} for type: {typ}".format(opstr=opstr, + typ=type(self))) + + return self._shallow_copy(op(self.values)) + + return _evaluate_numeric_unary + + cls.__mul__ = cls.__rmul__ = _make_evaluate_binop(operator.mul,'__mul__') + cls.__floordiv__ = cls.__rfloordiv__ = _make_evaluate_binop(operator.floordiv,'__floordiv__') + cls.__truediv__ = cls.__rtruediv__ = _make_evaluate_binop(operator.truediv,'__truediv__') if not compat.PY3: - cls.__div__ = cls.__rdiv__ = _make_evaluate_binop(operator.div,'division') + cls.__div__ = cls.__rdiv__ = _make_evaluate_binop(operator.div,'__div__') + cls.__neg__ = _make_evaluate_unary(lambda x: -x,'__neg__') + cls.__pos__ = _make_evaluate_unary(lambda x: x,'__pos__') + cls.__abs__ = _make_evaluate_unary(lambda x: np.abs(x),'__abs__') + cls.__inv__ = _make_evaluate_unary(lambda x: -x,'__inv__') + Index._add_numeric_methods_disabled() class NumericIndex(Index): @@ -2198,7 +2260,7 @@ def equals(self, other): # return False try: - return array_equivalent(self, other) + return array_equivalent(_values_from_object(self), _values_from_object(other)) except TypeError: # e.g. fails in numpy 1.6 with DatetimeIndex #1681 return False @@ -2309,7 +2371,8 @@ def get_value(self, series, key): k = _values_from_object(key) loc = self.get_loc(k) - new_values = series.values[loc] + new_values = _values_from_object(series)[loc] + if np.isscalar(new_values) or new_values is None: return new_values @@ -2874,10 +2937,16 @@ def values(self): values = [] for lev, lab in zip(self.levels, self.labels): - taken = com.take_1d(lev.values, lab) # Need to box timestamps, etc. - if hasattr(lev, '_box_values'): - taken = lev._box_values(taken) + box = hasattr(lev, '_box_values') + # Try to minimize boxing. 
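# Hedged illustration of the "minimize boxing" idea in this hunk: box each
# unique level value once and then .take(), rather than boxing one object per
# row; the box() helper below is a stand-in, not pandas' _box_values.
import numpy as np
from datetime import datetime, timedelta

def box(arr):
    # stand-in for lev._box_values: wrap raw integers in Python objects
    return np.array([datetime(2014, 1, 1) + timedelta(seconds=int(v))
                     for v in arr], dtype=object)

levels = np.array([0, 86400])            # 2 unique level values
labels = np.array([0, 1, 0, 1, 0, 1])    # 6 rows referencing them

per_row = box(levels.take(labels))       # boxes 6 values
per_level = box(levels).take(labels)     # boxes only the 2 uniques, then takes

print(list(per_row) == list(per_level))  # True: same values, fewer boxings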
+ if box and len(lev) > len(lab): + taken = lev._box_values(com.take_1d(lev.values, lab)) + elif box: + taken = com.take_1d(lev._box_values(lev.values), lab, + fill_value=_get_na_value(lev.dtype.type)) + else: + taken = com.take_1d(lev.values, lab) values.append(taken) self._tuples = lib.fast_zip(values) @@ -3159,7 +3228,7 @@ def from_arrays(cls, arrays, sortorder=None, names=None): return Index(arrays[0], name=name) cats = [Categorical.from_array(arr) for arr in arrays] - levels = [c.levels for c in cats] + levels = [c.categories for c in cats] labels = [c.codes for c in cats] if names is None: names = [c.name for c in cats] @@ -3254,7 +3323,7 @@ def from_product(cls, iterables, sortorder=None, names=None): categoricals = [Categorical.from_array(it) for it in iterables] labels = cartesian_product([c.codes for c in categoricals]) - return MultiIndex(levels=[c.levels for c in categoricals], + return MultiIndex(levels=[c.categories for c in categoricals], labels=labels, sortorder=sortorder, names=names) @property @@ -3276,8 +3345,8 @@ def __contains__(self, key): def __reduce__(self): """Necessary for making this object picklable""" - d = dict(levels = [lev.view(np.ndarray) for lev in self.levels], - labels = [label.view(np.ndarray) for label in self.labels], + d = dict(levels = [lev for lev in self.levels], + labels = [label for label in self.labels], sortorder = self.sortorder, names = list(self.names)) return _new_Index, (self.__class__, d), None @@ -3301,6 +3370,7 @@ def __setstate__(self, state): self._set_names(names) self.sortorder = sortorder self._verify_integrity() + self._reset_identity() def __getitem__(self, key): if np.isscalar(key): @@ -3636,22 +3706,36 @@ def get_indexer(self, target, method=None, limit=None): return com._ensure_platform_int(indexer) - def reindex(self, target, method=None, level=None, limit=None, - copy_if_needed=False): + def reindex(self, target, method=None, level=None, limit=None): """ - Performs any necessary conversion on the input index and calls - get_indexer. This method is here so MultiIndex and an Index of - like-labeled tuples can play nice together + Create index with target's values (move/add/delete values as necessary) Returns ------- - (new_index, indexer, mask) : (MultiIndex, ndarray, ndarray) + new_index : pd.MultiIndex + Resulting index + indexer : np.ndarray or None + Indices of output values in original index + """ + # GH6552: preserve names when reindexing to non-named target + # (i.e. neither Index nor Series). + preserve_names = not hasattr(target, 'names') if level is not None: if method is not None: raise TypeError('Fill method not supported if level passed') - target = _ensure_index(target) + + # GH7774: preserve dtype/tz if target is empty and not an Index. + target = _ensure_has_len(target) # target may be an iterator + if len(target) == 0 and not isinstance(target, Index): + idx = self.levels[level] + attrs = idx._get_attributes_dict() + attrs.pop('freq', None) # don't preserve freq + target = type(idx)._simple_new(np.empty(0, dtype=idx.dtype), + **attrs) + else: + target = _ensure_index(target) target, indexer, _ = self._join_level(target, level, how='right', return_indexers=True) else: @@ -3674,6 +3758,11 @@ def reindex(self, target, method=None, level=None, limit=None, # hopefully? 
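# Hedged sketch of the MultiIndex.reindex behaviour being reworked in this
# hunk: names are preserved when the target is a plain sequence of tuples
# (GH6552); the commented outputs are the expected results.
import pandas as pd

mi = pd.MultiIndex.from_product([['a', 'b'], [1, 2]], names=['key', 'num'])

new_index, indexer = mi.reindex([('a', 2), ('b', 1)])
print(new_index.names)   # expected: ['key', 'num'], carried over from mi
print(indexer)           # expected positions: 1, 2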
target = MultiIndex.from_tuples(target) + if (preserve_names and target.nlevels == self.nlevels and + target.names != self.names): + target = target.copy(deep=False) + target.names = self.names + return target, indexer @cache_readonly @@ -3926,9 +4015,21 @@ def _get_level_indexer(self, key, level=0): # handle a slice, returnig a slice if we can # otherwise a boolean indexer - start = level_index.get_loc(key.start or 0) - stop = level_index.get_loc(key.stop or len(level_index)-1) - step = key.step + try: + if key.start is not None: + start = level_index.get_loc(key.start) + else: + start = 0 + if key.stop is not None: + stop = level_index.get_loc(key.stop) + else: + stop = len(level_index)-1 + step = key.step + except (KeyError): + + # we have a partial slice (like looking up a partial date string) + start = stop = level_index.slice_indexer(key.start, key.stop, key.step) + step = start.step if isinstance(start,slice) or isinstance(stop,slice): # we have a slice for start and/or stop @@ -4080,7 +4181,8 @@ def equals(self, other): return True if not isinstance(other, MultiIndex): - return array_equivalent(self.values, _ensure_index(other)) + return array_equivalent(self.values, + _values_from_object(_ensure_index(other))) if self.nlevels != other.nlevels: return False @@ -4122,6 +4224,8 @@ def union(self, other): Returns ------- Index + + >>> index.union(index2) """ self._assert_can_do_setop(other) @@ -4164,7 +4268,7 @@ def intersection(self, other): return MultiIndex.from_arrays(lzip(*uniq_tuples), sortorder=0, names=result_names) - def diff(self, other): + def difference(self, other): """ Compute sorted set difference of two MultiIndex objects @@ -4469,8 +4573,8 @@ def _get_consensus_names(indexes): def _maybe_box(idx): - from pandas.tseries.api import DatetimeIndex, PeriodIndex - klasses = DatetimeIndex, PeriodIndex + from pandas.tseries.api import DatetimeIndex, PeriodIndex, TimedeltaIndex + klasses = DatetimeIndex, PeriodIndex, TimedeltaIndex if isinstance(idx, klasses): return idx.asobject @@ -4492,3 +4596,13 @@ def _get_na_rep(dtype): def _get_na_value(dtype): return {np.datetime64: tslib.NaT, np.timedelta64: tslib.NaT}.get(dtype, np.nan) + + +def _ensure_has_len(seq): + """If seq is an iterator, put its values into a list.""" + try: + len(seq) + except TypeError: + return list(seq) + else: + return seq diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index dfc552e8df0d7..6d002bc8d633a 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -303,12 +303,27 @@ def _setitem_with_indexer(self, indexer, value): "cannot set a frame with no defined columns" ) - index = self.obj._get_axis(0) - labels = _safe_append_to_index(index, indexer) - self.obj._data = self.obj.reindex_axis(labels, 0)._data + # append a Series + if isinstance(value, Series): + + value = value.reindex(index=self.obj.columns,copy=True) + value.name = indexer + + # a list-list + else: + + # must have conforming columns + if com.is_list_like(value): + if len(value) != len(self.obj.columns): + raise ValueError( + "cannot set a row with mismatched columns" + ) + + value = Series(value,index=self.obj.columns,name=indexer) + + self.obj._data = self.obj.append(value)._data self.obj._maybe_update_cacher(clear=True) - return getattr(self.obj, self.name).__setitem__(indexer, - value) + return self.obj # set using setitem (Panel and > dims) elif self.ndim >= 3: @@ -424,16 +439,10 @@ def can_do_equal_len(): if isinstance(value, ABCDataFrame) and value.ndim > 1: for item in labels: - # align to - if item in 
value: - v = value[item] - i = self.obj[item].index - v = v.reindex(i & v.index) - - setter(item, v.values) - else: - setter(item, np.nan) + v = np.nan if item not in value else \ + self._align_series(indexer[0], value[item]) + setter(item, v) # we have an equal len ndarray/convertible to our labels elif np.array(value).ndim == 2: @@ -496,6 +505,10 @@ def _align_series(self, indexer, ser): if isinstance(indexer, tuple): + # flatten np.ndarray indexers + ravel = lambda i: i.ravel() if isinstance(i, np.ndarray) else i + indexer = tuple(map(ravel, indexer)) + aligners = [not _is_null_slice(idx) for idx in indexer] sum_aligners = sum(aligners) single_aligner = sum_aligners == 1 @@ -521,12 +534,11 @@ def _align_series(self, indexer, ser): # series, so need to broadcast (see GH5206) if (sum_aligners == self.ndim and all([com._is_sequence(_) for _ in indexer])): - ser = ser.reindex(obj.axes[0][indexer[0].ravel()], - copy=True).values + ser = ser.reindex(obj.axes[0][indexer[0]], copy=True).values # single indexer if len(indexer) > 1: - l = len(indexer[1].ravel()) + l = len(indexer[1]) ser = np.tile(ser, l).reshape(l, -1).T return ser @@ -542,7 +554,7 @@ def _align_series(self, indexer, ser): if not is_list_like(new_ix): new_ix = Index([new_ix]) else: - new_ix = Index(new_ix.ravel()) + new_ix = Index(new_ix) if ser.index.equals(new_ix) or not len(new_ix): return ser.values.copy() @@ -594,7 +606,13 @@ def _align_series(self, indexer, ser): def _align_frame(self, indexer, df): is_frame = self.obj.ndim == 2 is_panel = self.obj.ndim >= 3 + if isinstance(indexer, tuple): + + aligners = [not _is_null_slice(idx) for idx in indexer] + sum_aligners = sum(aligners) + single_aligner = sum_aligners == 1 + idx, cols = None, None sindexers = [] for i, ix in enumerate(indexer): @@ -611,13 +629,21 @@ def _align_frame(self, indexer, df): # panel if is_panel: - if len(sindexers) == 1 and idx is None and cols is None: - if sindexers[0] == 0: - df = df.T - return self.obj.conform(df, axis=sindexers[0]) - df = df.T + + # need to conform to the convention + # as we are not selecting on the items axis + # and we have a single indexer + # GH 7763 + if len(sindexers) == 1 and sindexers[0] != 0: + df = df.T + + if idx is None: + idx = df.index + if cols is None: + cols = df.columns if idx is not None and cols is not None: + if df.index.equals(idx) and df.columns.equals(cols): val = df.copy().values else: @@ -640,21 +666,15 @@ def _align_frame(self, indexer, df): val = df.reindex(index=ax).values return val - elif np.isscalar(indexer) and not is_frame: + elif np.isscalar(indexer) and is_panel: idx = self.obj.axes[1] cols = self.obj.axes[2] # by definition we are indexing on the 0th axis - if is_panel: - df = df.T - - if idx.equals(df.index) and cols.equals(df.columns): - return df.copy().values - # a passed in dataframe which is actually a transpose # of what is needed - elif idx.equals(df.columns) and cols.equals(df.index): - return df.T.copy().values + if idx.equals(df.index) and cols.equals(df.columns): + return df.copy().values return df.reindex(idx, columns=cols).values @@ -1464,7 +1484,7 @@ class _ScalarAccessIndexer(_NDFrameIndexer): """ access scalars quickly """ - def _convert_key(self, key): + def _convert_key(self, key, is_setter=False): return list(key) def __getitem__(self, key): @@ -1485,7 +1505,7 @@ def __setitem__(self, key, value): if len(key) != self.obj.ndim: raise ValueError('Not enough indexers for scalar access ' '(setting)!') - key = list(self._convert_key(key)) + key = list(self._convert_key(key, 
is_setter=True)) key.append(value) self.obj.set_value(*key, takeable=self._takeable) @@ -1495,6 +1515,23 @@ class _AtIndexer(_ScalarAccessIndexer): """ label based scalar accessor """ _takeable = False + def _convert_key(self, key, is_setter=False): + """ require they keys to be the same type as the index (so we don't fallback) """ + + # allow arbitrary setting + if is_setter: + return list(key) + + for ax, i in zip(self.obj.axes, key): + if ax.is_integer(): + if not com.is_integer(i): + raise ValueError("At based indexing on an integer index can only have integer " + "indexers") + else: + if com.is_integer(i): + raise ValueError("At based indexing on an non-integer index can only have non-integer " + "indexers") + return key class _iAtIndexer(_ScalarAccessIndexer): @@ -1504,7 +1541,7 @@ class _iAtIndexer(_ScalarAccessIndexer): def _has_valid_setitem_indexer(self, indexer): self._has_valid_positional_setitem_indexer(indexer) - def _convert_key(self, key): + def _convert_key(self, key, is_setter=False): """ require integer args (and convert to label arguments) """ for a, i in zip(self.obj.axes, key): if not com.is_integer(i): @@ -1730,4 +1767,3 @@ def _maybe_droplevels(index, key): pass return index - diff --git a/pandas/core/internals.py b/pandas/core/internals.py index f3b8a54034d56..9be680d998216 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -13,7 +13,7 @@ ABCSparseSeries, _infer_dtype_from_scalar, _is_null_datelike_scalar, is_timedelta64_dtype, is_datetime64_dtype, - _possibly_infer_to_datetimelike) + _possibly_infer_to_datetimelike, array_equivalent) from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import (_maybe_convert_indices, _length_of_indexer) from pandas.core.categorical import Categorical, _maybe_to_categorical, _is_categorical @@ -24,7 +24,7 @@ import pandas.computation.expressions as expressions from pandas.util.decorators import cache_readonly -from pandas.tslib import Timestamp +from pandas.tslib import Timestamp, Timedelta from pandas import compat from pandas.compat import range, map, zip, u from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type @@ -245,7 +245,7 @@ def apply(self, func, **kwargs): """ apply the function to my values; return a block if we are not one """ result = func(self.values) if not isinstance(result, Block): - result = make_block(values=result, placement=self.mgr_locs,) + result = make_block(values=_block_shape(result), placement=self.mgr_locs,) return result @@ -357,6 +357,9 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None, return self.copy() return self + if klass is None: + if dtype == np.object_: + klass = ObjectBlock try: # force the copy here if values is None: @@ -494,6 +497,11 @@ def setitem(self, indexer, value): compatible shape """ + # coerce None values, if appropriate + if value is None: + if self.is_numeric: + value = np.nan + # coerce args values, value = self._try_coerce_args(self.values, value) arr_value = np.array(value) @@ -548,9 +556,15 @@ def setitem(self, indexer, value): else: dtype = 'infer' values = self._try_coerce_and_cast_result(values, dtype) - return [make_block(transf(values), + block = make_block(transf(values), ndim=self.ndim, placement=self.mgr_locs, - fastpath=True)] + fastpath=True) + + # may have to soft convert_objects here + if block.is_object and not self.is_object: + block = block.convert(convert_numeric=False) + + return block except (ValueError, TypeError) as detail: raise except Exception as detail: @@ 
-587,7 +601,7 @@ def putmask(self, mask, new, align=True, inplace=False): mask = mask.values.T # if we are passed a scalar None, convert it here - if not is_list_like(new) and isnull(new): + if not is_list_like(new) and isnull(new) and not self.is_object: new = self.fill_value if self._can_hold_element(new): @@ -817,7 +831,10 @@ def shift(self, periods, axis=0): if f_ordered: new_values = new_values.T axis = new_values.ndim - axis - 1 - new_values = np.roll(new_values, periods, axis=axis) + + if np.prod(new_values.shape): + new_values = np.roll(new_values, com._ensure_platform_int(periods), axis=axis) + axis_indexer = [ slice(None) ] * self.ndim if periods > 0: axis_indexer[axis] = slice(None,periods) @@ -1040,7 +1057,7 @@ def func(c, v, o): def equals(self, other): if self.dtype != other.dtype or self.shape != other.shape: return False - return np.array_equal(self.values, other.values) + return array_equivalent(self.values, other.values) class NonConsolidatableMixIn(object): @@ -1053,16 +1070,19 @@ class NonConsolidatableMixIn(object): def __init__(self, values, placement, ndim=None, fastpath=False,): + # Placement must be converted to BlockPlacement via property setter + # before ndim logic, because placement may be a slice which doesn't + # have a length. + self.mgr_locs = placement + # kludgetastic if ndim is None: - if len(placement) != 1: + if len(self.mgr_locs) != 1: ndim = 1 else: ndim = 2 self.ndim = ndim - self.mgr_locs = placement - if not isinstance(values, self._holder): raise TypeError("values must be {0}".format(self._holder.__name__)) @@ -1224,6 +1244,8 @@ def _try_fill(self, value): """ if we are a NaT, return the actual fill value """ if isinstance(value, type(tslib.NaT)) or np.array(isnull(value)).all(): value = tslib.iNaT + elif isinstance(value, Timedelta): + value = value.value elif isinstance(value, np.timedelta64): pass elif com.is_integer(value): @@ -1249,8 +1271,8 @@ def masker(v): if _is_null_datelike_scalar(other): other = np.nan - elif isinstance(other, np.timedelta64): - other = _coerce_scalar_to_timedelta_type(other, unit='s').item() + elif isinstance(other, (np.timedelta64, Timedelta, timedelta)): + other = _coerce_scalar_to_timedelta_type(other, unit='s', box=False).item() if other == tslib.iNaT: other = np.nan else: @@ -1270,7 +1292,7 @@ def _try_coerce_result(self, result): result = result.astype('m8[ns]') result[mask] = tslib.iNaT elif isinstance(result, np.integer): - result = np.timedelta64(result) + result = lib.Timedelta(result) return result def should_store(self, value): @@ -1289,17 +1311,21 @@ def to_native_types(self, slicer=None, na_rep=None, **kwargs): na_rep = 'NaT' rvalues[mask] = na_rep imask = (~mask).ravel() - rvalues.flat[imask] = np.array([lib.repr_timedelta64(val) + + #### FIXME #### + # should use the core.format.Timedelta64Formatter here + # to figure what format to pass to the Timedelta + # e.g. 
to not show the decimals say + rvalues.flat[imask] = np.array([Timedelta(val)._repr_base(format='all') for val in values.ravel()[imask]], dtype=object) return rvalues.tolist() def get_values(self, dtype=None): - # return object dtypes as datetime.timedeltas + # return object dtypes as Timedelta if dtype == object: - return lib.map_infer(self.values.ravel(), - lambda x: timedelta(microseconds=x.item() / 1000) + return lib.map_infer(self.values.ravel(), lib.Timedelta ).reshape(self.values.shape) return self.values @@ -1628,6 +1654,27 @@ def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): return self.make_block_same_class(new_values, new_mgr_locs) + def putmask(self, mask, new, align=True, inplace=False): + """ putmask the data to the block; it is possible that we may create a + new dtype of block + + return the resulting block(s) + + Parameters + ---------- + mask : the condition to respect + new : a ndarray/object + align : boolean, perform alignment on other/cond, default is True + inplace : perform inplace modification, default is False + + Returns + ------- + a new block(s), the result of the putmask + """ + new_values = self.values if inplace else self.values.copy() + new_values[mask] = new + return [self.make_block_same_class(values=new_values, placement=self.mgr_locs)] + def _astype(self, dtype, copy=False, raise_on_error=True, values=None, klass=None): """ @@ -1654,12 +1701,12 @@ def _concat_blocks(self, blocks, values): return the block concatenation """ - levels = self.values.levels + categories = self.values.categories for b in blocks: - if not levels.equals(b.values.levels): + if not categories.equals(b.values.categories): raise ValueError("incompatible levels in categorical block merge") - return self._holder(values[0], levels=levels) + return self._holder(values[0], categories=categories) def to_native_types(self, slicer=None, na_rep='', **kwargs): """ convert to our native types format, slicing if desired """ @@ -1787,16 +1834,6 @@ def to_native_types(self, slicer=None, na_rep=None, date_format=None, def should_store(self, value): return issubclass(value.dtype.type, np.datetime64) - def astype(self, dtype, copy=False, raise_on_error=True): - """ - handle convert to object as a special case - """ - klass = None - if np.dtype(dtype).type == np.object_: - klass = ObjectBlock - return self._astype(dtype, copy=copy, raise_on_error=raise_on_error, - klass=klass) - def set(self, locs, values, check=False): """ Modify Block in-place with new item value @@ -1818,6 +1855,7 @@ def get_values(self, dtype=None): .reshape(self.values.shape) return self.values + class SparseBlock(NonConsolidatableMixIn, Block): """ implement as a list of sparse arrays of the same dtype """ __slots__ = () @@ -1827,27 +1865,6 @@ class SparseBlock(NonConsolidatableMixIn, Block): _ftype = 'sparse' _holder = SparseArray - def __init__(self, values, placement, - ndim=None, fastpath=False,): - - # Placement must be converted to BlockPlacement via property setter - # before ndim logic, because placement may be a slice which doesn't - # have a length. 
- self.mgr_locs = placement - - # kludgetastic - if ndim is None: - if len(self.mgr_locs) != 1: - ndim = 1 - else: - ndim = 2 - self.ndim = ndim - - if not isinstance(values, SparseArray): - raise TypeError("values must be SparseArray") - - self.values = values - @property def shape(self): return (len(self.mgr_locs), self.sp_index.length) @@ -3054,7 +3071,7 @@ def reindex_axis(self, new_index, axis, method=None, limit=None, """ new_index = _ensure_index(new_index) new_index, indexer = self.axes[axis].reindex( - new_index, method=method, limit=limit, copy_if_needed=True) + new_index, method=method, limit=limit) return self.reindex_indexer(new_index, indexer, axis=axis, fill_value=fill_value, copy=copy) @@ -3479,7 +3496,7 @@ def create_block_manager_from_arrays(arrays, names, axes): mgr._consolidate_inplace() return mgr except (ValueError) as e: - construction_error(len(arrays), arrays[0].shape[1:], axes, e) + construction_error(len(arrays), arrays[0].shape, axes, e) def form_blocks(arrays, names, axes): @@ -3890,14 +3907,16 @@ def _putmask_smart(v, m, n): Parameters ---------- - v : array_like - m : array_like - n : array_like + v : `values`, updated in-place (array like) + m : `mask`, applies to both sides (array like) + n : `new values` either scalar or an array like aligned with `values` """ # n should be the length of the mask or a scalar here if not is_list_like(n): n = np.array([n] * len(m)) + elif isinstance(n, np.ndarray) and n.ndim == 0: # numpy scalar + n = np.repeat(np.array(n, ndmin=1), len(m)) # see if we are only masking values that if putted # will work in the current dtype @@ -3915,10 +3934,10 @@ def _putmask_smart(v, m, n): dtype, _ = com._maybe_promote(n.dtype) nv = v.astype(dtype) try: - nv[m] = n + nv[m] = n[m] except ValueError: idx, = np.where(np.squeeze(m)) - for mask_index, new_val in zip(idx, n): + for mask_index, new_val in zip(idx, n[m]): nv[mask_index] = new_val return nv diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index aa6140383a27a..9703dba40a18a 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -72,6 +72,10 @@ def f(values, axis=None, skipna=True, **kwds): try: if self.zero_value is not None and values.size == 0: if values.ndim == 1: + + # wrap the 0's if needed + if is_timedelta64_dtype(values): + return lib.Timedelta(0) return 0 else: result_shape = (values.shape[:axis] + @@ -222,19 +226,9 @@ def _wrap_results(result, dtype): result = result.view(dtype) elif is_timedelta64_dtype(dtype): if not isinstance(result, np.ndarray): - - # this is a scalar timedelta result! 
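# The hunk continuing below replaces the old Series round-trip with a direct
# lib.Timedelta(result); hedged sketch of the user-visible effect on
# timedelta64 reductions (outputs are expected values, not verified here).
import pandas as pd

s = pd.Series(pd.to_timedelta(['1 days', '2 days', '3 days']))
print(s.sum())    # expected: Timedelta('6 days 00:00:00')
print(s.mean())   # expected: Timedelta('2 days 00:00:00')
print(s.max())    # expected: Timedelta('3 days 00:00:00')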
- # we have series convert then take the element (scalar) - # as series will do the right thing in py3 (and deal with numpy - # 1.6.2 bug in that it results dtype of timedelta64[us] - from pandas import Series - - # coerce float to results - if is_float(result): - result = int(result) - result = Series([result], dtype='timedelta64[ns]') + result = lib.Timedelta(result) else: - result = result.view(dtype) + result = result.astype('i8').view(dtype) return result @@ -301,7 +295,7 @@ def get_median(x): if values.ndim > 1: # there's a non-empty array to apply over otherwise numpy raises if notempty: - return np.apply_along_axis(get_median, axis, values) + return _wrap_results(np.apply_along_axis(get_median, axis, values), dtype) # must return the correct shape, but median is not defined for the # empty set so return nans of shape "everything but the passed axis" @@ -311,10 +305,10 @@ def get_median(x): dims = np.arange(values.ndim) ret = np.empty(shp[dims != axis]) ret.fill(np.nan) - return ret + return _wrap_results(ret, dtype) # otherwise return a scalar value - return _wrap_results(get_median(values), dtype) if notempty else np.nan + return _wrap_results(get_median(values) if notempty else np.nan, dtype) def _get_counts_nanvar(mask, axis, ddof): @@ -335,14 +329,12 @@ def _get_counts_nanvar(mask, axis, ddof): return count, d -@disallow('M8') -@bottleneck_switch(ddof=1) -def nanvar(values, axis=None, skipna=True, ddof=1): +def _nanvar(values, axis=None, skipna=True, ddof=1): + # private nanvar calculator + mask = isnull(values) if not _is_floating_dtype(values): values = values.astype('f8') - mask = isnull(values) - count, d = _get_counts_nanvar(mask, axis, ddof) if skipna: @@ -353,13 +345,30 @@ def nanvar(values, axis=None, skipna=True, ddof=1): XX = _ensure_numeric((values ** 2).sum(axis)) return np.fabs((XX - X ** 2 / count) / d) +@disallow('M8') +@bottleneck_switch(ddof=1) +def nanstd(values, axis=None, skipna=True, ddof=1): + + result = np.sqrt(_nanvar(values, axis=axis, skipna=skipna, ddof=ddof)) + return _wrap_results(result, values.dtype) + +@disallow('M8','m8') +@bottleneck_switch(ddof=1) +def nanvar(values, axis=None, skipna=True, ddof=1): + + # we are going to allow timedelta64[ns] here + # but NOT going to coerce them to the Timedelta type + # as this could cause overflow + # so var cannot be computed (but std can!) 
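# Hedged sketch of the std/var asymmetry set up here: nanstd goes through the
# private _nanvar and wraps the result back to a Timedelta, while nanvar itself
# is decorated with @disallow('M8','m8') and should reject timedelta64 input.
import pandas as pd

s = pd.Series(pd.to_timedelta(['1 days', '2 days', '4 days']))
print(s.std())            # expected: a Timedelta (std is well-defined here)
try:
    s.var()               # expected: rejected for timedelta64 dtype
except TypeError as exc:
    print('var disallowed:', exc)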
+ return _nanvar(values, axis=axis, skipna=skipna, ddof=ddof) +@disallow('M8','m8') def nansem(values, axis=None, skipna=True, ddof=1): var = nanvar(values, axis, skipna, ddof=ddof) + mask = isnull(values) if not _is_floating_dtype(values): values = values.astype('f8') - mask = isnull(values) count, _ = _get_counts_nanvar(mask, axis, ddof) return np.sqrt(var)/np.sqrt(count) @@ -448,12 +457,13 @@ def nanargmin(values, axis=None, skipna=True): return result -@disallow('M8') +@disallow('M8','m8') def nanskew(values, axis=None, skipna=True): + + mask = isnull(values) if not _is_floating_dtype(values): values = values.astype('f8') - mask = isnull(values) count = _get_counts(mask, axis) if skipna: @@ -482,12 +492,13 @@ def nanskew(values, axis=None, skipna=True): return result -@disallow('M8') +@disallow('M8','m8') def nankurt(values, axis=None, skipna=True): + + mask = isnull(values) if not _is_floating_dtype(values): values = values.astype('f8') - mask = isnull(values) count = _get_counts(mask, axis) if skipna: @@ -516,7 +527,7 @@ def nankurt(values, axis=None, skipna=True): return result -@disallow('M8') +@disallow('M8','m8') def nanprod(values, axis=None, skipna=True): mask = isnull(values) if skipna and not _is_any_int_dtype(values): @@ -580,7 +591,7 @@ def _zero_out_fperr(arg): return 0 if np.abs(arg) < 1e-14 else arg -@disallow('M8') +@disallow('M8','m8') def nancorr(a, b, method='pearson', min_periods=None): """ a, b: ndarrays @@ -627,7 +638,7 @@ def _spearman(a, b): return _cor_methods[method] -@disallow('M8') +@disallow('M8','m8') def nancov(a, b, min_periods=None): if len(a) != len(b): raise AssertionError('Operands to nancov must have same size') @@ -709,6 +720,10 @@ def unique1d(values): table = _hash.Int64HashTable(len(values)) uniques = table.unique(_ensure_int64(values)) uniques = uniques.view('M8[ns]') + elif np.issubdtype(values.dtype, np.timedelta64): + table = _hash.Int64HashTable(len(values)) + uniques = table.unique(_ensure_int64(values)) + uniques = uniques.view('m8[ns]') elif np.issubdtype(values.dtype, np.integer): table = _hash.Int64HashTable(len(values)) uniques = table.unique(_ensure_int64(values)) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 16e6e40802a95..068cdff7fcf2d 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -12,7 +12,6 @@ import pandas.index as _index from pandas.util.decorators import Appender import pandas.core.common as com -import pandas.core.array as pa import pandas.computation.expressions as expressions from pandas.core.common import(bind_method, is_list_like, notnull, isnull, _values_from_object, _maybe_match_name) @@ -162,20 +161,39 @@ def add_special_arithmetic_methods(cls, arith_method=None, radd_func=None, if passed, will not set functions with names in exclude """ radd_func = radd_func or operator.add + # in frame, special methods have default_axis = None, comp methods use # 'columns' + new_methods = _create_methods(arith_method, radd_func, comp_method, bool_method, use_numexpr, default_axis=None, special=True) # inplace operators (I feel like these should get passed an `inplace=True` # or just be removed + + def _wrap_inplace_method(method): + """ + return an inplace wrapper for this method + """ + + def f(self, other): + result = method(self, other) + + # this makes sure that we are aligned like the input + # we are updating inplace so we want to ignore is_copy + self._update_inplace(result.reindex_like(self,copy=False)._data, + verify_is_copy=False) + + return self + return f + new_methods.update(dict( - 
__iadd__=new_methods["__add__"], - __isub__=new_methods["__sub__"], - __imul__=new_methods["__mul__"], - __itruediv__=new_methods["__truediv__"], - __ipow__=new_methods["__pow__"] + __iadd__=_wrap_inplace_method(new_methods["__add__"]), + __isub__=_wrap_inplace_method(new_methods["__sub__"]), + __imul__=_wrap_inplace_method(new_methods["__mul__"]), + __itruediv__=_wrap_inplace_method(new_methods["__truediv__"]), + __ipow__=_wrap_inplace_method(new_methods["__pow__"]), )) if not compat.PY3: new_methods["__idiv__"] = new_methods["__div__"] @@ -314,7 +332,7 @@ def _validate(self): def _convert_to_array(self, values, name=None, other=None): """converts values to ndarray""" - from pandas.tseries.timedeltas import _possibly_cast_to_timedelta + from pandas.tseries.timedeltas import to_timedelta coerce = True if not is_list_like(values): @@ -332,12 +350,12 @@ def _convert_to_array(self, values, name=None, other=None): # a datelike elif isinstance(values, pd.DatetimeIndex): values = values.to_series() - elif not (isinstance(values, (pa.Array, pd.Series)) and + elif not (isinstance(values, (np.ndarray, pd.Series)) and com.is_datetime64_dtype(values)): values = tslib.array_to_datetime(values) elif inferred_type in ('timedelta', 'timedelta64'): # have a timedelta, convert to to ns here - values = _possibly_cast_to_timedelta(values, coerce=coerce, dtype='timedelta64[ns]') + values = to_timedelta(values, coerce=coerce) elif inferred_type == 'integer': # py3 compat where dtype is 'm' but is an integer if values.dtype.kind == 'm': @@ -349,14 +367,14 @@ def _convert_to_array(self, values, name=None, other=None): "operation [{0}]".format(name)) elif isinstance(values[0], pd.DateOffset): # handle DateOffsets - os = pa.array([getattr(v, 'delta', None) for v in values]) + os = np.array([getattr(v, 'delta', None) for v in values]) mask = isnull(os) if mask.any(): raise TypeError("cannot use a non-absolute DateOffset in " "datetime/timedelta operations [{0}]".format( ', '.join([com.pprint_thing(v) for v in values[mask]]))) - values = _possibly_cast_to_timedelta(os, coerce=coerce) + values = to_timedelta(os, coerce=coerce) elif inferred_type == 'floating': # all nan, so ok, use the other dtype (e.g. 
timedelta or datetime) @@ -366,10 +384,10 @@ def _convert_to_array(self, values, name=None, other=None): else: raise TypeError( 'incompatible type [{0}] for a datetime/timedelta ' - 'operation'.format(pa.array(values).dtype)) + 'operation'.format(np.array(values).dtype)) else: raise TypeError("incompatible type [{0}] for a datetime/timedelta" - " operation".format(pa.array(values).dtype)) + " operation".format(np.array(values).dtype)) return values @@ -408,7 +426,7 @@ def _convert_for_datetime(self, lvalues, rvalues): if mask is not None: if mask.any(): def f(x): - x = pa.array(x, dtype=self.dtype) + x = np.array(x, dtype=self.dtype) np.putmask(x, mask, self.fill_value) return x self.wrap_results = f @@ -449,19 +467,19 @@ def na_op(x, y): result = expressions.evaluate(op, str_rep, x, y, raise_on_error=True, **eval_kwargs) except TypeError: - if isinstance(y, (pa.Array, pd.Series, pd.Index)): + if isinstance(y, (np.ndarray, pd.Series, pd.Index)): dtype = np.find_common_type([x.dtype, y.dtype], []) result = np.empty(x.size, dtype=dtype) mask = notnull(x) & notnull(y) result[mask] = op(x[mask], _values_from_object(y[mask])) - elif isinstance(x, pa.Array): - result = pa.empty(len(x), dtype=x.dtype) + elif isinstance(x, np.ndarray): + result = np.empty(len(x), dtype=x.dtype) mask = notnull(x) result[mask] = op(x[mask], y) else: raise TypeError("{typ} cannot perform the operation {op}".format(typ=type(x).__name__,op=str_rep)) - result, changed = com._maybe_upcast_putmask(result, ~mask, pa.NA) + result, changed = com._maybe_upcast_putmask(result, ~mask, np.nan) result = com._fill_zeros(result, x, y, name, fill_zeros) return result @@ -522,11 +540,16 @@ def _comp_method_SERIES(op, name, str_rep, masker=False): code duplication. """ def na_op(x, y): + + if com.is_categorical_dtype(x) != (not np.isscalar(y) and com.is_categorical_dtype(y)): + msg = "Cannot compare a Categorical for op {op} with type {typ}. If you want to \n" \ + "compare values, use 'series np.asarray(cat)'." + raise TypeError(msg.format(op=op,typ=type(y))) if x.dtype == np.object_: if isinstance(y, list): y = lib.list_to_object_array(y) - if isinstance(y, (pa.Array, pd.Series)): + if isinstance(y, (np.ndarray, pd.Series)): if y.dtype != np.object_: result = lib.vec_compare(x, y.astype(np.object_), op) else: @@ -553,11 +576,16 @@ def wrapper(self, other): index=self.index, name=name) elif isinstance(other, pd.DataFrame): # pragma: no cover return NotImplemented - elif isinstance(other, (pa.Array, pd.Series, pd.Index)): + elif isinstance(other, (np.ndarray, pd.Index)): if len(self) != len(other): raise ValueError('Lengths must match to compare') return self._constructor(na_op(self.values, np.asarray(other)), index=self.index).__finalize__(self) + elif isinstance(other, pd.Categorical): + if not com.is_categorical_dtype(self): + msg = "Cannot compare a Categorical for op {op} with Series of dtype {typ}.\n"\ + "If you want to compare values, use 'series np.asarray(other)'." 
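# Hedged sketch of the comparison guard added in this hunk: an ordinary Series
# compared against a Categorical now raises, and the error message points at
# np.asarray as the way to compare raw values instead.
import numpy as np
import pandas as pd

cat = pd.Categorical(['a', 'b', 'a'])
s = pd.Series(['a', 'b', 'c'])

try:
    s == cat                    # expected: TypeError per the raise below
except TypeError as exc:
    print('refused:', exc)

print(s == np.asarray(cat))     # expected: elementwise True, True, False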
+ raise TypeError(msg.format(op=op,typ=self.dtype)) else: mask = isnull(self) @@ -565,7 +593,7 @@ def wrapper(self, other): values = self.get_values() other = _index.convert_scalar(values,_values_from_object(other)) - if issubclass(values.dtype.type, np.datetime64): + if issubclass(values.dtype.type, (np.datetime64, np.timedelta64)): values = values.view('i8') # scalars @@ -600,7 +628,7 @@ def na_op(x, y): if isinstance(y, list): y = lib.list_to_object_array(y) - if isinstance(y, (pa.Array, pd.Series)): + if isinstance(y, (np.ndarray, pd.Series)): if (x.dtype == np.bool_ and y.dtype == np.bool_): # pragma: no cover result = op(x, y) # when would this be hit? @@ -678,7 +706,7 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0): self._get_axis_number(axis) if isinstance(other, pd.Series): return self._binop(other, op, level=level, fill_value=fill_value) - elif isinstance(other, (pa.Array, pd.Series, list, tuple)): + elif isinstance(other, (np.ndarray, pd.Series, list, tuple)): if len(other) != len(self): raise ValueError('Lengths must be equal') return self._binop(self._constructor(other, self.index), op, @@ -775,7 +803,7 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): # casted = self._constructor_sliced(other, index=self.columns) casted = pd.Series(other, index=self.columns) return self._combine_series(casted, na_op, fill_value, axis, level) - elif isinstance(other, np.ndarray): + elif isinstance(other, np.ndarray) and other.ndim: # skips np scalar if other.ndim == 1: if axis is not None and self._get_axis_name(axis) == 'index': # casted = self._constructor_sliced(other, @@ -915,10 +943,10 @@ def na_op(x, y): except TypeError: # TODO: might need to find_common_type here? - result = pa.empty(len(x), dtype=x.dtype) + result = np.empty(len(x), dtype=x.dtype) mask = notnull(x) result[mask] = op(x[mask], y) - result, changed = com._maybe_upcast_putmask(result, ~mask, pa.NA) + result, changed = com._maybe_upcast_putmask(result, ~mask, np.nan) result = com._fill_zeros(result, x, y, name, fill_zeros) return result diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 1e6ed56386f63..686a0c4f6cca4 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -7,6 +7,7 @@ OrderedDefaultdict) from pandas import compat import sys +import warnings import numpy as np from pandas.core.common import (PandasError, _try_sort, _default_index, _infer_dtype_from_scalar, notnull) @@ -98,7 +99,7 @@ def panel_index(time, panels, names=['time', 'panel']): panel_factor = Categorical.from_array(panels) labels = [time_factor.codes, panel_factor.codes] - levels = [time_factor.levels, panel_factor.levels] + levels = [time_factor.categories, panel_factor.categories] return MultiIndex(levels, labels, sortorder=None, names=names, verify_integrity=False) @@ -677,9 +678,9 @@ def _combine_frame(self, other, func, axis=0): self.minor_axis) def _combine_panel(self, other, func): - items = self.items + other.items - major = self.major_axis + other.major_axis - minor = self.minor_axis + other.minor_axis + items = self.items.union(other.items) + major = self.major_axis.union(other.major_axis) + minor = self.minor_axis.union(other.minor_axis) # could check that everything's the same size, but forget it this = self.reindex(items=items, major=major, minor=minor) @@ -710,7 +711,7 @@ def major_xs(self, key, copy=None): major_xs is only for getting, not setting values. 
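# The Panel hunks above swap the deprecated Index '+' operator for explicit
# .union() calls; hedged sketch of the set-op spelling that this change (and
# the index.py hunks earlier in the patch) standardise on.
import pandas as pd

i1 = pd.Index(['a', 'b', 'c'])
i2 = pd.Index(['b', 'c', 'd'])

print(i1.union(i2))         # expected: Index(['a', 'b', 'c', 'd'], dtype='object')
print(i1.intersection(i2))  # expected: Index(['b', 'c'], dtype='object')
print(i1.difference(i2))    # expected: Index(['a'], dtype='object');
                            # .diff() survives only as a deprecated alias
# 'i1 + i2' and 'i1 - i2' now emit FutureWarnings steering users to these methods.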
MultiIndex Slicers is a generic way to get/set values on any level or levels - it is a superset of major_xs functionality, see :ref:`MultiIndex Slicers ` + it is a superset of major_xs functionality, see :ref:`MultiIndex Slicers ` """ if copy is not None: @@ -740,7 +741,7 @@ def minor_xs(self, key, copy=None): minor_xs is only for getting, not setting values. MultiIndex Slicers is a generic way to get/set values on any level or levels - it is a superset of minor_xs functionality, see :ref:`MultiIndex Slicers ` + it is a superset of minor_xs functionality, see :ref:`MultiIndex Slicers ` """ if copy is not None: @@ -770,7 +771,7 @@ def xs(self, key, axis=1, copy=None): xs is only for getting, not setting values. MultiIndex Slicers is a generic way to get/set values on any level or levels - it is a superset of xs functionality, see :ref:`MultiIndex Slicers ` + it is a superset of xs functionality, see :ref:`MultiIndex Slicers ` """ if copy is not None: diff --git a/pandas/core/panelnd.py b/pandas/core/panelnd.py index 3eebd51190e3d..ec0a313ff5767 100644 --- a/pandas/core/panelnd.py +++ b/pandas/core/panelnd.py @@ -82,7 +82,7 @@ def _combine_with_constructor(self, other, func): # combine labels to form new axes new_axes = [] for a in self._AXIS_ORDERS: - new_axes.append(getattr(self, a) + getattr(other, a)) + new_axes.append(getattr(self, a).union(getattr(other, a))) # reindex: could check that everything's the same size, but forget it d = dict([(a, ax) for a, ax in zip(self._AXIS_ORDERS, new_axes)]) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index b014ede6e65a8..bb6f6f4d00cd8 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -601,7 +601,7 @@ def _stack_multi_columns(frame, level=-1, dropna=True): # tuple list excluding level for grouping columns if len(frame.columns.levels) > 2: tuples = list(zip(*[ - lev.values.take(lab) for lev, lab in + lev.take(lab) for lev, lab in zip(this.columns.levels[:-1], this.columns.labels[:-1]) ])) unique_groups = [key for key, _ in itertools.groupby(tuples)] @@ -848,7 +848,7 @@ def lreshape(data, groups, dropna=True, label=None): keys, values = zip(*groups) all_cols = list(set.union(*[set(x) for x in values])) - id_cols = list(data.columns.diff(all_cols)) + id_cols = list(data.columns.difference(all_cols)) K = len(values[0]) @@ -979,27 +979,42 @@ def convert_dummies(data, cat_variables, prefix_sep='_'): ------- dummies : DataFrame """ + import warnings + + warnings.warn("'convert_dummies' is deprecated and will be removed " + "in a future release. Use 'get_dummies' instead.", + FutureWarning) + result = data.drop(cat_variables, axis=1) for variable in cat_variables: - dummies = get_dummies(data[variable], prefix=variable, - prefix_sep=prefix_sep) + dummies = _get_dummies_1d(data[variable], prefix=variable, + prefix_sep=prefix_sep) result = result.join(dummies) return result -def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False): +def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, + columns=None): """ Convert categorical variable into dummy/indicator variables Parameters ---------- - data : array-like or Series - prefix : string, default None + data : array-like, Series, or DataFrame + prefix : string, list of strings, or dict of strings, default None String to append DataFrame column names + Pass a list with length equal to the number of columns + when calling get_dummies on a DataFrame. Alternativly, `prefix` + can be a dictionary mapping column names to prefixes. 
prefix_sep : string, default '_' - If appending prefix, separator/delimiter to use + If appending prefix, separator/delimiter to use. Or pass a + list or dictionary as with `prefix.` dummy_na : bool, default False Add a column to indicate NaNs, if False NaNs are ignored. + columns : list-like, default None + Column names in the DataFrame to be encoded. + If `columns` is None then all the columns with + `object` or `category` dtype will be converted. Returns ------- @@ -1031,12 +1046,74 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False): 1 0 1 0 2 0 0 1 + >>> df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], + 'C': [1, 2, 3]}) + + >>> get_dummies(df, prefix=['col1', 'col2']): + C col1_a col1_b col2_a col2_b col2_c + 0 1 1 0 0 1 0 + 1 2 0 1 1 0 0 + 2 3 1 0 0 0 1 + See also ``Series.str.get_dummies``. """ + from pandas.tools.merge import concat + from itertools import cycle + + if isinstance(data, DataFrame): + # determine columns being encoded + + if columns is None: + columns_to_encode = data.select_dtypes(include=['object', + 'category']).columns + else: + columns_to_encode = columns + + # validate prefixes and separator to avoid silently dropping cols + def check_len(item, name): + length_msg = ("Length of '{0}' ({1}) did " + "not match the length of the columns " + "being encoded ({2}).") + + if com.is_list_like(item): + if not len(item) == len(columns_to_encode): + raise ValueError(length_msg.format(name, len(item), + len(columns_to_encode))) + + check_len(prefix, 'prefix') + check_len(prefix_sep, 'prefix_sep') + if isinstance(prefix, compat.string_types): + prefix = cycle([prefix]) + if isinstance(prefix, dict): + prefix = [prefix[col] for col in columns_to_encode] + + if prefix is None: + prefix = columns_to_encode + + # validate separators + if isinstance(prefix_sep, compat.string_types): + prefix_sep = cycle([prefix_sep]) + elif isinstance(prefix_sep, dict): + prefix_sep = [prefix_sep[col] for col in columns_to_encode] + + result = data.drop(columns_to_encode, axis=1) + with_dummies = [result] + for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep): + + dummy = _get_dummies_1d(data[col], prefix=pre, + prefix_sep=sep, dummy_na=dummy_na) + with_dummies.append(dummy) + result = concat(with_dummies, axis=1) + else: + result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na) + return result + + +def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False): # Series avoids inconsistent NaN handling cat = Categorical.from_array(Series(data)) - levels = cat.levels + levels = cat.categories # if all NaN if not dummy_na and len(levels) == 0: @@ -1053,7 +1130,7 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False): dummy_mat = np.eye(number_of_cols).take(cat.codes, axis=0) if dummy_na: - levels = np.append(cat.levels, np.nan) + levels = np.append(cat.categories, np.nan) else: # reset NaN GH4446 dummy_mat[cat.codes == -1] = 0 @@ -1105,7 +1182,7 @@ def make_axis_dummies(frame, axis='minor', transform=None): mapped_items = items.map(transform) cat = Categorical.from_array(mapped_items.take(labels)) labels = cat.codes - items = cat.levels + items = cat.categories values = np.eye(len(items), dtype=float) values = values.take(labels, axis=0) diff --git a/pandas/core/series.py b/pandas/core/series.py index 2f0e651bfc5b1..0408d62ce302c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -19,8 +19,9 @@ is_list_like, _values_from_object, _possibly_cast_to_datetime, _possibly_castable, _possibly_convert_platform, _try_sort, - 
ABCSparseArray, _maybe_match_name, - _ensure_object, SettingWithCopyError) + ABCSparseArray, _maybe_match_name, _coerce_to_dtype, + _ensure_object, SettingWithCopyError, + _maybe_box_datetimelike, ABCDataFrame) from pandas.core.index import (Index, MultiIndex, InvalidIndexError, _ensure_index) from pandas.core.indexing import _check_bool_indexer, _maybe_convert_indices @@ -28,12 +29,12 @@ from pandas.core.internals import SingleBlockManager from pandas.core.categorical import Categorical from pandas.tseries.index import DatetimeIndex +from pandas.tseries.tdi import TimedeltaIndex from pandas.tseries.period import PeriodIndex, Period from pandas import compat from pandas.util.terminal import get_terminal_size from pandas.compat import zip, u, OrderedDict -import pandas.core.array as pa import pandas.core.ops as ops from pandas.core.algorithms import select_n @@ -143,7 +144,7 @@ def __init__(self, data=None, index=None, dtype=None, name=None, data = data._to_embed(keep_tz=True) copy = True - elif isinstance(data, pa.Array): + elif isinstance(data, np.ndarray): pass elif isinstance(data, Series): if name is None: @@ -163,12 +164,12 @@ def __init__(self, data=None, index=None, dtype=None, name=None, if isinstance(index, DatetimeIndex): # coerce back to datetime objects for lookup data = lib.fast_multiget(data, index.astype('O'), - default=pa.NA) + default=np.nan) elif isinstance(index, PeriodIndex): data = [data.get(i, nan) for i in index] else: data = lib.fast_multiget(data, index.values, - default=pa.NA) + default=np.nan) except TypeError: data = [data.get(i, nan) for i in index] @@ -248,9 +249,7 @@ def _set_axis(self, axis, labels, fastpath=False): is_all_dates = labels.is_all_dates if is_all_dates: - from pandas.tseries.index import DatetimeIndex - from pandas.tseries.period import PeriodIndex - if not isinstance(labels, (DatetimeIndex, PeriodIndex)): + if not isinstance(labels, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): labels = DatetimeIndex(labels) # need to set here becuase we changed the index @@ -268,8 +267,9 @@ def _set_subtyp(self, is_all_dates): else: object.__setattr__(self, '_subtyp', 'series') - def _update_inplace(self, result): - return generic.NDFrame._update_inplace(self, result) + def _update_inplace(self, result, **kwargs): + # we want to call the generic version and not the IndexOpsMixin + return generic.NDFrame._update_inplace(self, result, **kwargs) # ndarray compatibility @property @@ -331,11 +331,27 @@ def compress(self, condition, axis=0, out=None, **kwargs): def nonzero(self): """ - return the a boolean array of the underlying data is nonzero + Return the indices of the elements that are non-zero - See also + This method is equivalent to calling `numpy.nonzero` on the + series data. For compatability with NumPy, the return value is + the same (a tuple with an array of indices for each dimension), + but it will always be a one-item tuple because series only have + one dimension. 
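Stepping back to the get_dummies changes earlier in this diff: with the new `columns` argument and per-column `prefix`/`prefix_sep`, a whole frame can be encoded in one call. A minimal sketch of the extended signature, assuming that hunk is applied (column and prefix names are illustrative only):

    import pandas as pd

    df = pd.DataFrame({'A': ['a', 'b', 'a'],
                       'B': ['b', 'a', 'c'],
                       'C': [1, 2, 3]})

    # columns= limits encoding to the listed columns; a dict prefix maps each
    # encoded column to its own prefix, and untouched columns pass through as-is
    dummies = pd.get_dummies(df, columns=['A', 'B'],
                             prefix={'A': 'col1', 'B': 'col2'})
    # expected columns: C, col1_a, col1_b, col2_a, col2_b, col2_c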
+ + Examples -------- - numpy.ndarray.nonzero + >>> s = pd.Series([0, 3, 0, 4]) + >>> s.nonzero() + (array([1, 3]),) + >>> s.iloc[s.nonzero()[0]] + 1 3 + 3 4 + dtype: int64 + + See Also + -------- + numpy.nonzero """ return self.values.nonzero() @@ -530,6 +546,9 @@ def _get_with(self, key): if isinstance(key, slice): indexer = self.index._convert_slice_indexer(key, typ='getitem') return self._get_values(indexer) + elif isinstance(key, ABCDataFrame): + raise TypeError('Indexing a Series with DataFrame is not supported, '\ + 'use the appropriate DataFrame column') else: if isinstance(key, tuple): try: @@ -542,7 +561,7 @@ def _get_with(self, key): raise # pragma: no cover - if not isinstance(key, (list, pa.Array, Series, Index)): + if not isinstance(key, (list, np.ndarray, Series, Index)): key = list(key) if isinstance(key, Index): @@ -672,7 +691,7 @@ def _set_with(self, key, value): except Exception: pass - if not isinstance(key, (list, Series, pa.Array, Series)): + if not isinstance(key, (list, Series, np.ndarray, Series)): try: key = list(key) except: @@ -766,7 +785,7 @@ def get_value(self, label, takeable=False): value : scalar value """ if takeable is True: - return self.values[label] + return _maybe_box_datetimelike(self.values[label]) return self.index.get_value(self.values, label) def set_value(self, label, value, takeable=False): @@ -823,7 +842,7 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False): resetted : DataFrame, or Series if drop == True """ if drop: - new_index = pa.arange(len(self)) + new_index = np.arange(len(self)) if level is not None and isinstance(self.index, MultiIndex): if not isinstance(level, (tuple, list)): level = [level] @@ -906,7 +925,7 @@ def _repr_footer(self): # Categorical if com.is_categorical_dtype(self.dtype): - level_info = self.cat._repr_level_info() + level_info = self.values._repr_categories_info() return u('%sLength: %d, dtype: %s\n%s') % (namestr, len(self), str(self.dtype.name), @@ -987,6 +1006,8 @@ def __iter__(self): return iter(self.values) elif np.issubdtype(self.dtype, np.datetime64): return (lib.Timestamp(x) for x in self.values) + elif np.issubdtype(self.dtype, np.timedelta64): + return (lib.Timedelta(x) for x in self.values) else: return iter(self.values) @@ -1093,7 +1114,7 @@ def count(self, level=None): # call cython function max_bin = len(level_index) labels = com._ensure_int64(self.index.labels[level]) - counts = lib.count_level_1d(mask.view(pa.uint8), + counts = lib.count_level_1d(mask.view(np.uint8), labels, max_bin) return self._constructor(counts, index=level_index).__finalize__(self) @@ -1153,7 +1174,7 @@ def idxmin(self, axis=None, out=None, skipna=True): """ i = nanops.nanargmin(_values_from_object(self), skipna=skipna) if i == -1: - return pa.NA + return np.nan return self.index[i] def idxmax(self, axis=None, out=None, skipna=True): @@ -1180,14 +1201,14 @@ def idxmax(self, axis=None, out=None, skipna=True): """ i = nanops.nanargmax(_values_from_object(self), skipna=skipna) if i == -1: - return pa.NA + return np.nan return self.index[i] # ndarray compat argmin = idxmin argmax = idxmax - @Appender(pa.Array.round.__doc__) + @Appender(np.ndarray.round.__doc__) def round(self, decimals=0, out=None): """ @@ -1226,9 +1247,7 @@ def quantile(self, q=0.5): 0.75 3.25 dtype: float64 """ - valid_values = self.dropna().values - if len(valid_values) == 0: - return pa.NA + valid = self.dropna() def multi(values, qs): if com.is_list_like(qs): @@ -1237,17 +1256,7 @@ def multi(values, qs): else: return _quantile(values, 
qs*100) - if com.is_datetime64_dtype(self): - values = _values_from_object(self).view('i8') - result = multi(values, q) - if com.is_list_like(q): - result = result.map(lib.Timestamp) - else: - result = lib.Timestamp(result) - else: - result = multi(valid_values, q) - - return result + return self._maybe_box(lambda values: multi(values, q), dropna=True) def ptp(self, axis=None, out=None): return _values_from_object(self).ptp(axis, out) @@ -1274,7 +1283,7 @@ def corr(self, other, method='pearson', """ this, other = self.align(other, join='inner', copy=False) if len(this) == 0: - return pa.NA + return np.nan return nanops.nancorr(this.values, other.values, method=method, min_periods=min_periods) @@ -1296,7 +1305,7 @@ def cov(self, other, min_periods=None): """ this, other = self.align(other, join='inner', copy=False) if len(this) == 0: - return pa.NA + return np.nan return nanops.nancov(this.values, other.values, min_periods=min_periods) @@ -1368,6 +1377,65 @@ def dot(self, other): else: # pragma: no cover raise TypeError('unsupported type: %s' % type(other)) + def searchsorted(self, v, side='left', sorter=None): + """Find indices where elements should be inserted to maintain order. + + Find the indices into a sorted Series `self` such that, if the + corresponding elements in `v` were inserted before the indices, the + order of `self` would be preserved. + + Parameters + ---------- + v : array_like + Values to insert into `a`. + side : {'left', 'right'}, optional + If 'left', the index of the first suitable location found is given. + If 'right', return the last such index. If there is no suitable + index, return either 0 or N (where N is the length of `a`). + sorter : 1-D array_like, optional + Optional array of integer indices that sort `self` into ascending + order. They are typically the result of ``np.argsort``. + + Returns + ------- + indices : array of ints + Array of insertion points with the same shape as `v`. + + See Also + -------- + Series.sort + Series.order + numpy.searchsorted + + Notes + ----- + Binary search is used to find the required insertion points. 
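The quantile rewrite above routes datetimelike Series through a boxing helper: view the values as int64 nanoseconds, run the numeric function, then box the result back. A standalone sketch of that pattern (not the internal helper itself), assuming a datetime64[ns] Series:

    import numpy as np
    import pandas as pd

    s = pd.Series(pd.date_range('2014-01-01', periods=5))

    ivals = s.dropna().values.view('i8')   # datetime64[ns] -> int64 nanoseconds
    med = np.percentile(ivals, 50)         # operate on the integer view
    print(pd.Timestamp(int(med)))          # box the scalar result back to a Timestamp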
+ + Examples + -------- + >>> x = pd.Series([1, 2, 3]) + >>> x + 0 1 + 1 2 + 2 3 + dtype: int64 + >>> x.searchsorted(4) + array([3]) + >>> x.searchsorted([0, 4]) + array([0, 3]) + >>> x.searchsorted([1, 3], side='left') + array([0, 2]) + >>> x.searchsorted([1, 3], side='right') + array([1, 3]) + >>> x.searchsorted([1, 2], side='right', sorter=[0, 2, 1]) + array([1, 3]) + """ + if sorter is not None: + sorter = com._ensure_platform_int(sorter) + + return self.values.searchsorted(Series(v).values, side=side, + sorter=sorter) + #------------------------------------------------------------------------------ # Combination @@ -1458,9 +1526,9 @@ def combine(self, other, func, fill_value=nan): result : Series """ if isinstance(other, Series): - new_index = self.index + other.index + new_index = self.index.union(other.index) new_name = _maybe_match_name(self, other) - new_values = pa.empty(len(new_index), dtype=self.dtype) + new_values = np.empty(len(new_index), dtype=self.dtype) for i, idx in enumerate(new_index): lv = self.get(idx, fill_value) rv = other.get(idx, fill_value) @@ -1484,7 +1552,7 @@ def combine_first(self, other): ------- y : Series """ - new_index = self.index + other.index + new_index = self.index.union(other.index) this = self.reindex(new_index, copy=False) other = other.reindex(new_index, copy=False) name = _maybe_match_name(self, other) @@ -1686,12 +1754,12 @@ def _try_kind_sort(arr): return arr.argsort(kind='quicksort') arr = self.values - sortedIdx = pa.empty(len(self), dtype=np.int32) + sortedIdx = np.empty(len(self), dtype=np.int32) bad = isnull(arr) good = ~bad - idx = pa.arange(len(self)) + idx = np.arange(len(self)) argsorted = _try_kind_sort(arr[good]) @@ -1924,7 +1992,7 @@ def map(self, arg, na_action=None): mask = isnull(values) def map_f(values, f): - return lib.map_infer_mask(values, f, mask.view(pa.uint8)) + return lib.map_infer_mask(values, f, mask.view(np.uint8)) else: map_f = lib.map_infer @@ -2000,9 +2068,49 @@ def _reduce(self, op, axis=0, skipna=True, numeric_only=None, delegate = self.values if isinstance(delegate, np.ndarray): return op(delegate, skipna=skipna, **kwds) + return delegate._reduce(op=op, axis=axis, skipna=skipna, numeric_only=numeric_only, filter_type=filter_type, name=name, **kwds) + def _maybe_box(self, func, dropna=False): + """ + evaluate a function with possible input/output conversion if we are i8 + + Parameters + ---------- + dropna : bool, default False + whether to drop values if necessary + + """ + if dropna: + values = self.dropna().values + else: + values = self.values + + if com.needs_i8_conversion(self): + boxer = com.i8_boxer(self) + + if len(values) == 0: + return boxer(iNaT) + + values = values.view('i8') + result = func(values) + + if com.is_list_like(result): + result = result.map(boxer) + else: + result = boxer(result) + + else: + + # let the function return nan if appropriate + if dropna: + if len(values) == 0: + return np.nan + result = func(values) + + return result + def _reindex_indexer(self, new_index, indexer, copy): if indexer is None: if copy: @@ -2200,7 +2308,8 @@ def to_csv(self, path, index=True, sep=",", na_rep='', Parameters ---------- - path : string file path or file handle / StringIO + path : string file path or file handle / StringIO. If None is provided + the result is returned as a string. 
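With the to_csv change documented just above (the implementation follows below), passing path=None returns the CSV text instead of writing a file. A small sketch, assuming the change is applied:

    import pandas as pd

    s = pd.Series([1, 2, 3], name='x')
    text = s.to_csv(None)   # no path: the rendered CSV comes back as a string
    print(text)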
na_rep : string, default '' Missing data representation float_format : string, default None @@ -2224,10 +2333,13 @@ def to_csv(self, path, index=True, sep=",", na_rep='', """ from pandas.core.frame import DataFrame df = DataFrame(self) - df.to_csv(path, index=index, sep=sep, na_rep=na_rep, + # result is only a string if no path provided, otherwise None + result = df.to_csv(path, index=index, sep=sep, na_rep=na_rep, float_format=float_format, header=header, index_label=index_label, mode=mode, nanRep=nanRep, encoding=encoding, date_format=date_format) + if path is None: + return result def dropna(self, axis=0, inplace=False, **kwargs): """ @@ -2311,7 +2423,7 @@ def asof(self, where): start = start.ordinal if where < start: - return pa.NA + return np.nan loc = self.index.searchsorted(where, side='right') if loc > 0: loc -= 1 @@ -2390,11 +2502,12 @@ def dt(self): #------------------------------------------------------------------------------ # Categorical methods - @property + @cache_readonly def cat(self): + from pandas.core.categorical import CategoricalAccessor if not com.is_categorical_dtype(self.dtype): raise TypeError("Can only use .cat accessor with a 'category' dtype") - return self.values + return CategoricalAccessor(self.values, self.index) Series._setup_axes(['index'], info_axis=0, stat_axis=0, aliases={'rows': 0}) @@ -2425,6 +2538,11 @@ def _sanitize_index(data, index, copy=False): data = data._to_embed(keep_tz=True) if copy: data = data.copy() + elif isinstance(data, np.ndarray): + + # coerce datetimelike types + if data.dtype.kind in ['M','m']: + data = _sanitize_array(data, index, copy=copy) return data @@ -2433,7 +2551,7 @@ def _sanitize_array(data, index, dtype=None, copy=False, """ sanitize input data to an ndarray, copy if specified, coerce to the dtype if specified """ if dtype is not None: - dtype = np.dtype(dtype) + dtype = _coerce_to_dtype(dtype) if isinstance(data, ma.MaskedArray): mask = ma.getmaskarray(data) @@ -2452,16 +2570,18 @@ def _try_cast(arr, take_fast_path): try: arr = _possibly_cast_to_datetime(arr, dtype) - subarr = pa.array(arr, dtype=dtype, copy=copy) + subarr = np.array(arr, dtype=dtype, copy=copy) except (ValueError, TypeError): - if dtype is not None and raise_cast_failure: + if com.is_categorical_dtype(dtype): + subarr = Categorical(arr) + elif dtype is not None and raise_cast_failure: raise - else: # pragma: no cover - subarr = pa.array(arr, dtype=object, copy=copy) + else: + subarr = np.array(arr, dtype=object, copy=copy) return subarr # GH #846 - if isinstance(data, (pa.Array, Index, Series)): + if isinstance(data, (np.ndarray, Index, Series)): subarr = np.array(data, copy=False) if dtype is not None: @@ -2506,7 +2626,7 @@ def _try_cast(arr, take_fast_path): except Exception: if raise_cast_failure: # pragma: no cover raise - subarr = pa.array(data, dtype=object, copy=copy) + subarr = np.array(data, dtype=object, copy=copy) subarr = lib.maybe_convert_objects(subarr) else: @@ -2520,7 +2640,7 @@ def _try_cast(arr, take_fast_path): # scalar like if subarr.ndim == 0: if isinstance(data, list): # pragma: no cover - subarr = pa.array(data, dtype=object) + subarr = np.array(data, dtype=object) elif index is not None: value = data @@ -2531,7 +2651,7 @@ def _try_cast(arr, take_fast_path): # need to possibly convert the value here value = _possibly_cast_to_datetime(value, dtype) - subarr = pa.empty(len(index), dtype=dtype) + subarr = np.empty(len(index), dtype=dtype) subarr.fill(value) else: @@ -2544,11 +2664,11 @@ def _try_cast(arr, take_fast_path): # a 
1-element ndarray if len(subarr) != len(index) and len(subarr) == 1: value = subarr[0] - subarr = pa.empty(len(index), dtype=subarr.dtype) + subarr = np.empty(len(index), dtype=subarr.dtype) subarr.fill(value) elif subarr.ndim > 1: - if isinstance(data, pa.Array): + if isinstance(data, np.ndarray): raise Exception('Data must be 1-dimensional') else: subarr = _asarray_tuplesafe(data, dtype=dtype) @@ -2556,7 +2676,7 @@ def _try_cast(arr, take_fast_path): # This is to prevent mixed-type Series getting all casted to # NumPy string type, e.g. NaN --> '-1#IND'. if issubclass(subarr.dtype.type, compat.string_types): - subarr = pa.array(data, dtype=object, copy=copy) + subarr = np.array(data, dtype=object, copy=copy) return subarr diff --git a/pandas/index.pyx b/pandas/index.pyx index 3dcdbf207fb3f..d6e358a96e904 100644 --- a/pandas/index.pyx +++ b/pandas/index.pyx @@ -1,7 +1,7 @@ from numpy cimport ndarray from numpy cimport (float64_t, int32_t, int64_t, uint8_t, - NPY_DATETIME) + NPY_DATETIME, NPY_TIMEDELTA) cimport cython cimport numpy as cnp @@ -16,7 +16,7 @@ import numpy as np cimport tslib from hashtable cimport * from pandas import algos, tslib, hashtable as _hash -from pandas.tslib import Timestamp +from pandas.tslib import Timestamp, Timedelta from datetime cimport (get_datetime64_value, _pydatetime_to_dts, pandas_datetimestruct) @@ -57,6 +57,8 @@ cdef inline is_definitely_invalid_key(object val): def get_value_at(ndarray arr, object loc): if arr.descr.type_num == NPY_DATETIME: return Timestamp(util.get_value_at(arr, loc)) + elif arr.descr.type_num == NPY_TIMEDELTA: + return Timedelta(util.get_value_at(arr, loc)) return util.get_value_at(arr, loc) def set_value_at(ndarray arr, object loc, object val): @@ -108,6 +110,8 @@ cdef class IndexEngine: else: if arr.descr.type_num == NPY_DATETIME: return Timestamp(util.get_value_at(arr, loc)) + elif arr.descr.type_num == NPY_TIMEDELTA: + return Timedelta(util.get_value_at(arr, loc)) return util.get_value_at(arr, loc) cpdef set_value(self, ndarray arr, object key, object value): @@ -498,6 +502,9 @@ cdef class ObjectEngine(IndexEngine): cdef class DatetimeEngine(Int64Engine): + cdef _get_box_dtype(self): + return 'M8[ns]' + def __contains__(self, object val): if self.over_size_threshold and self.is_monotonic: if not self.is_unique: @@ -559,26 +566,31 @@ cdef class DatetimeEngine(Int64Engine): def get_indexer(self, values): self._ensure_mapping_populated() - if values.dtype != 'M8[ns]': + if values.dtype != self._get_box_dtype(): return np.repeat(-1, len(values)).astype('i4') values = np.asarray(values).view('i8') return self.mapping.lookup(values) def get_pad_indexer(self, other, limit=None): - if other.dtype != 'M8[ns]': + if other.dtype != self._get_box_dtype(): return np.repeat(-1, len(other)).astype('i4') other = np.asarray(other).view('i8') return algos.pad_int64(self._get_index_values(), other, limit=limit) def get_backfill_indexer(self, other, limit=None): - if other.dtype != 'M8[ns]': + if other.dtype != self._get_box_dtype(): return np.repeat(-1, len(other)).astype('i4') other = np.asarray(other).view('i8') return algos.backfill_int64(self._get_index_values(), other, limit=limit) +cdef class TimedeltaEngine(DatetimeEngine): + + cdef _get_box_dtype(self): + return 'm8[ns]' + cpdef convert_scalar(ndarray arr, object value): if arr.descr.type_num == NPY_DATETIME: if isinstance(value,np.ndarray): @@ -589,6 +601,15 @@ cpdef convert_scalar(ndarray arr, object value): return iNaT else: return Timestamp(value).value + elif arr.descr.type_num == 
NPY_TIMEDELTA: + if isinstance(value,np.ndarray): + pass + elif isinstance(value, Timedelta): + return value.value + elif value is None or value != value: + return iNaT + else: + return Timedelta(value).value if issubclass(arr.dtype.type, (np.integer, np.bool_)): if util.is_float_object(value) and value != value: diff --git a/pandas/io/clipboard.py b/pandas/io/clipboard.py index 204eeab74196e..dfa46156aaead 100644 --- a/pandas/io/clipboard.py +++ b/pandas/io/clipboard.py @@ -1,5 +1,5 @@ """ io on the clipboard """ -from pandas import compat, get_option, DataFrame +from pandas import compat, get_option, option_context, DataFrame from pandas.compat import StringIO @@ -91,7 +91,8 @@ def to_clipboard(obj, excel=None, sep=None, **kwargs): # pragma: no cover if isinstance(obj, DataFrame): # str(df) has various unhelpful defaults, like truncation - objstr = obj.to_string() + with option_context('display.max_colwidth', 999999): + objstr = obj.to_string(**kwargs) else: objstr = str(obj) clipboard_set(objstr) diff --git a/pandas/io/data.py b/pandas/io/data.py index c40b91ffa91c9..f0b078944d8ea 100644 --- a/pandas/io/data.py +++ b/pandas/io/data.py @@ -735,16 +735,20 @@ def _get_option_data(self, month, year, expiry, name): raise RemoteDataError("Table location {0} invalid, {1} tables" " found".format(table_loc, ntables)) - option_data = _parse_options_data(tables[table_loc]) - option_data['Type'] = name[:-1] - option_data = self._process_data(option_data, name[:-1]) + try: + option_data = _parse_options_data(tables[table_loc]) + option_data['Type'] = name[:-1] + option_data = self._process_data(option_data, name[:-1]) + + if month == CUR_MONTH and year == CUR_YEAR: + setattr(self, name, option_data) - if month == CUR_MONTH and year == CUR_YEAR: + name += m1 + str(year)[-2:] setattr(self, name, option_data) + return option_data - name += m1 + str(year)[-2:] - setattr(self, name, option_data) - return option_data + except (Exception) as e: + raise RemoteDataError("Cannot retrieve Table data {0}".format(str(e))) def get_call_data(self, month=None, year=None, expiry=None): """ @@ -1139,7 +1143,7 @@ def _get_expiry_months(self): try: links = root.xpath('.//*[@id="yfncsumtab"]')[0].xpath('.//a') except IndexError: - return RemoteDataError('Expiry months not available') + raise RemoteDataError('Expiry months not available') month_gen = (element.attrib['href'].split('=')[-1] for element in links diff --git a/pandas/io/excel.py b/pandas/io/excel.py index f81cf6502a0e6..424518cbde4f8 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -46,6 +46,20 @@ def register_writer(klass): def get_writer(engine_name): + if engine_name == 'openpyxl': + try: + import openpyxl + + # with version-less openpyxl engine + # make sure we make the intelligent choice for the user + if LooseVersion(openpyxl.__version__) < '2.0.0': + return _writers['openpyxl1'] + else: + return _writers['openpyxl2'] + except ImportError: + # fall through to normal exception handling below + pass + try: return _writers[engine_name] except KeyError: @@ -57,7 +71,7 @@ def read_excel(io, sheetname=0, **kwds): Parameters ---------- - io : string, file-like object, or xlrd workbook. + io : string, file-like object, or xlrd workbook. The string could be a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. 
For instance, a local file could be file://localhost/path/to/workbook.xlsx @@ -152,7 +166,7 @@ def __init__(self, io, **kwds): self.book = io elif not isinstance(io, xlrd.Book) and hasattr(io, "read"): # N.B. xlrd.Book has a read attribute too - data = io.read() + data = io.read() self.book = xlrd.open_workbook(file_contents=data) else: raise ValueError('Must explicitly set engine if not passing in' @@ -527,20 +541,20 @@ def close(self): return self.save() -class _OpenpyxlWriter(ExcelWriter): - engine = 'openpyxl' +class _Openpyxl1Writer(ExcelWriter): + engine = 'openpyxl1' supported_extensions = ('.xlsx', '.xlsm') + openpyxl_majorver = 1 def __init__(self, path, engine=None, **engine_kwargs): - if not openpyxl_compat.is_compat(): + if not openpyxl_compat.is_compat(major_ver=self.openpyxl_majorver): raise ValueError('Installed openpyxl is not supported at this ' - 'time. Use >={0} and ' - '<{1}.'.format(openpyxl_compat.start_ver, - openpyxl_compat.stop_ver)) + 'time. Use {0}.x.y.' + .format(self.openpyxl_majorver)) # Use the openpyxl module as the Excel writer. from openpyxl.workbook import Workbook - super(_OpenpyxlWriter, self).__init__(path, **engine_kwargs) + super(_Openpyxl1Writer, self).__init__(path, **engine_kwargs) # Create workbook object with default optimized_write=True. self.book = Workbook() @@ -632,9 +646,430 @@ def _convert_to_style(cls, style_dict): return xls_style +register_writer(_Openpyxl1Writer) + + +class _OpenpyxlWriter(_Openpyxl1Writer): + engine = 'openpyxl' + register_writer(_OpenpyxlWriter) +class _Openpyxl2Writer(_Openpyxl1Writer): + """ + Note: Support for OpenPyxl v2 is currently EXPERIMENTAL (GH7565). + """ + engine = 'openpyxl2' + openpyxl_majorver = 2 + + def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): + # Write the frame cells using openpyxl. + from openpyxl.cell import get_column_letter + + sheet_name = self._get_sheet_name(sheet_name) + + if sheet_name in self.sheets: + wks = self.sheets[sheet_name] + else: + wks = self.book.create_sheet() + wks.title = sheet_name + self.sheets[sheet_name] = wks + + for cell in cells: + colletter = get_column_letter(startcol + cell.col + 1) + xcell = wks.cell("%s%s" % (colletter, startrow + cell.row + 1)) + xcell.value = _conv_value(cell.val) + style_kwargs = {} + + # Apply format codes before cell.style to allow override + if isinstance(cell.val, datetime.datetime): + style_kwargs.update(self._convert_to_style_kwargs({ + 'number_format':{'format_code': self.datetime_format}})) + elif isinstance(cell.val, datetime.date): + style_kwargs.update(self._convert_to_style_kwargs({ + 'number_format':{'format_code': self.date_format}})) + + if cell.style: + style_kwargs.update(self._convert_to_style_kwargs(cell.style)) + + if style_kwargs: + xcell.style = xcell.style.copy(**style_kwargs) + + if cell.mergestart is not None and cell.mergeend is not None: + cletterstart = get_column_letter(startcol + cell.col + 1) + cletterend = get_column_letter(startcol + cell.mergeend + 1) + + wks.merge_cells('%s%s:%s%s' % (cletterstart, + startrow + cell.row + 1, + cletterend, + startrow + cell.mergestart + 1)) + + # Excel requires that the format of the first cell in a merged + # range is repeated in the rest of the merged range. 
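Taken together, the excel.py changes above mean the version-less 'openpyxl' engine is dispatched to the v1 or v2 writer depending on the installed openpyxl, and read_excel accepts URLs. A usage sketch, assuming openpyxl is installed and the URL/file names are purely illustrative (openpyxl 2 support is experimental per GH7565):

    import pandas as pd

    # read_excel now accepts http/ftp/s3/file URLs as well as local paths
    remote = pd.read_excel('http://example.com/data/workbook.xlsx', sheetname=0)

    df = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})

    # get_writer() picks _Openpyxl1Writer or _Openpyxl2Writer from the installed version
    writer = pd.ExcelWriter('out.xlsx', engine='openpyxl')
    df.to_excel(writer, sheet_name='Sheet1')
    writer.save()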
+ if style_kwargs: + first_row = startrow + cell.row + 1 + last_row = startrow + cell.mergestart + 1 + first_col = startcol + cell.col + 1 + last_col = startcol + cell.mergeend + 1 + + for row in range(first_row, last_row + 1): + for col in range(first_col, last_col + 1): + if row == first_row and col == first_col: + # Ignore first cell. It is already handled. + continue + colletter = get_column_letter(col) + xcell = wks.cell("%s%s" % (colletter, row)) + xcell.style = xcell.style.copy(**style_kwargs) + + @classmethod + def _convert_to_style_kwargs(cls, style_dict): + """ + Convert a style_dict to a set of kwargs suitable for initializing + or updating-on-copy an openpyxl v2 style object + Parameters + ---------- + style_dict : dict + A dict with zero or more of the following keys (or their synonyms). + 'font' + 'fill' + 'border' ('borders') + 'alignment' + 'number_format' + 'protection' + Returns + ------- + style_kwargs : dict + A dict with the same, normalized keys as ``style_dict`` but each + value has been replaced with a native openpyxl style object of the + appropriate class. + """ + + _style_key_map = { + 'borders': 'border', + } + + style_kwargs = {} + for k, v in style_dict.items(): + if k in _style_key_map: + k = _style_key_map[k] + _conv_to_x = getattr(cls, '_convert_to_{0}'.format(k), + lambda x: None) + new_v = _conv_to_x(v) + if new_v: + style_kwargs[k] = new_v + + return style_kwargs + + + @classmethod + def _convert_to_color(cls, color_spec): + """ + Convert ``color_spec`` to an openpyxl v2 Color object + Parameters + ---------- + color_spec : str, dict + A 32-bit ARGB hex string, or a dict with zero or more of the + following keys. + 'rgb' + 'indexed' + 'auto' + 'theme' + 'tint' + 'index' + 'type' + Returns + ------- + color : openpyxl.styles.Color + """ + + from openpyxl.styles import Color + + if isinstance(color_spec, str): + return Color(color_spec) + else: + return Color(**color_spec) + + + @classmethod + def _convert_to_font(cls, font_dict): + """ + Convert ``font_dict`` to an openpyxl v2 Font object + Parameters + ---------- + font_dict : dict + A dict with zero or more of the following keys (or their synonyms). + 'name' + 'size' ('sz') + 'bold' ('b') + 'italic' ('i') + 'underline' ('u') + 'strikethrough' ('strike') + 'color' + 'vertAlign' ('vertalign') + 'charset' + 'scheme' + 'family' + 'outline' + 'shadow' + 'condense' + Returns + ------- + font : openpyxl.styles.Font + """ + + from openpyxl.styles import Font + + _font_key_map = { + 'sz': 'size', + 'b': 'bold', + 'i': 'italic', + 'u': 'underline', + 'strike': 'strikethrough', + 'vertalign': 'vertAlign', + } + + font_kwargs = {} + for k, v in font_dict.items(): + if k in _font_key_map: + k = _font_key_map[k] + if k == 'color': + v = cls._convert_to_color(v) + font_kwargs[k] = v + + return Font(**font_kwargs) + + + @classmethod + def _convert_to_stop(cls, stop_seq): + """ + Convert ``stop_seq`` to a list of openpyxl v2 Color objects, + suitable for initializing the ``GradientFill`` ``stop`` parameter. + Parameters + ---------- + stop_seq : iterable + An iterable that yields objects suitable for consumption by + ``_convert_to_color``. 
+ Returns + ------- + stop : list of openpyxl.styles.Color + """ + + return map(cls._convert_to_color, stop_seq) + + + @classmethod + def _convert_to_fill(cls, fill_dict): + """ + Convert ``fill_dict`` to an openpyxl v2 Fill object + Parameters + ---------- + fill_dict : dict + A dict with one or more of the following keys (or their synonyms), + 'fill_type' ('patternType', 'patterntype') + 'start_color' ('fgColor', 'fgcolor') + 'end_color' ('bgColor', 'bgcolor') + or one or more of the following keys (or their synonyms). + 'type' ('fill_type') + 'degree' + 'left' + 'right' + 'top' + 'bottom' + 'stop' + Returns + ------- + fill : openpyxl.styles.Fill + """ + + from openpyxl.styles import PatternFill, GradientFill + + _pattern_fill_key_map = { + 'patternType': 'fill_type', + 'patterntype': 'fill_type', + 'fgColor': 'start_color', + 'fgcolor': 'start_color', + 'bgColor': 'end_color', + 'bgcolor': 'end_color', + } + + _gradient_fill_key_map = { + 'fill_type': 'type', + } + + pfill_kwargs = {} + gfill_kwargs = {} + for k, v in fill_dict.items(): + pk = gk = None + if k in _pattern_fill_key_map: + pk = _pattern_fill_key_map[k] + if k in _gradient_fill_key_map: + gk = _gradient_fill_key_map[k] + if pk in ['start_color', 'end_color']: + v = cls._convert_to_color(v) + if gk == 'stop': + v = cls._convert_to_stop(v) + if pk: + pfill_kwargs[pk] = v + elif gk: + gfill_kwargs[gk] = v + else: + pfill_kwargs[k] = v + gfill_kwargs[k] = v + + try: + return PatternFill(**pfill_kwargs) + except TypeError: + return GradientFill(**gfill_kwargs) + + + @classmethod + def _convert_to_side(cls, side_spec): + """ + Convert ``side_spec`` to an openpyxl v2 Side object + Parameters + ---------- + side_spec : str, dict + A string specifying the border style, or a dict with zero or more + of the following keys (or their synonyms). + 'style' ('border_style') + 'color' + Returns + ------- + side : openpyxl.styles.Side + """ + + from openpyxl.styles import Side + + _side_key_map = { + 'border_style': 'style', + } + + if isinstance(side_spec, str): + return Side(style=side_spec) + + side_kwargs = {} + for k, v in side_spec.items(): + if k in _side_key_map: + k = _side_key_map[k] + if k == 'color': + v = cls._convert_to_color(v) + side_kwargs[k] = v + + return Side(**side_kwargs) + + + @classmethod + def _convert_to_border(cls, border_dict): + """ + Convert ``border_dict`` to an openpyxl v2 Border object + Parameters + ---------- + border_dict : dict + A dict with zero or more of the following keys (or their synonyms). + 'left' + 'right' + 'top' + 'bottom' + 'diagonal' + 'diagonal_direction' + 'vertical' + 'horizontal' + 'diagonalUp' ('diagonalup') + 'diagonalDown' ('diagonaldown') + 'outline' + Returns + ------- + border : openpyxl.styles.Border + """ + + from openpyxl.styles import Border + + _border_key_map = { + 'diagonalup': 'diagonalUp', + 'diagonaldown': 'diagonalDown', + } + + border_kwargs = {} + for k, v in border_dict.items(): + if k in _border_key_map: + k = _border_key_map[k] + if k == 'color': + v = cls._convert_to_color(v) + if k in ['left', 'right', 'top', 'bottom', 'diagonal']: + v = cls._convert_to_side(v) + border_kwargs[k] = v + + return Border(**border_kwargs) + + + @classmethod + def _convert_to_alignment(cls, alignment_dict): + """ + Convert ``alignment_dict`` to an openpyxl v2 Alignment object + Parameters + ---------- + alignment_dict : dict + A dict with zero or more of the following keys (or their synonyms). 
+ 'horizontal' + 'vertical' + 'text_rotation' + 'wrap_text' + 'shrink_to_fit' + 'indent' + Returns + ------- + alignment : openpyxl.styles.Alignment + """ + + from openpyxl.styles import Alignment + + return Alignment(**alignment_dict) + + + @classmethod + def _convert_to_number_format(cls, number_format_dict): + """ + Convert ``number_format_dict`` to an openpyxl v2.1.0 number format + initializer. + Parameters + ---------- + number_format_dict : dict + A dict with zero or more of the following keys. + 'format_code' : str + Returns + ------- + number_format : str + """ + try: + # >= 2.0.0 < 2.1.0 + from openpyxl.styles import NumberFormat + return NumberFormat(**number_format_dict) + except: + # >= 2.1.0 + return number_format_dict['format_code'] + + @classmethod + def _convert_to_protection(cls, protection_dict): + """ + Convert ``protection_dict`` to an openpyxl v2 Protection object. + Parameters + ---------- + protection_dict : dict + A dict with zero or more of the following keys. + 'locked' + 'hidden' + Returns + ------- + """ + + from openpyxl.styles import Protection + + return Protection(**protection_dict) + + +register_writer(_Openpyxl2Writer) + + class _XlwtWriter(ExcelWriter): engine = 'xlwt' supported_extensions = ('.xls',) diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index 76848a62d0d5f..20c1e9f591081 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -6,7 +6,6 @@ import uuid import numpy as np -import pkg_resources from distutils.version import LooseVersion from pandas import compat @@ -20,46 +19,54 @@ _GOOGLE_FLAGS_INSTALLED = False _GOOGLE_FLAGS_VALID_VERSION = False _HTTPLIB2_INSTALLED = False +_SETUPTOOLS_INSTALLED = False if not compat.PY3: - + try: - from apiclient.discovery import build - from apiclient.http import MediaFileUpload - from apiclient.errors import HttpError + import pkg_resources + _SETUPTOOLS_INSTALLED = True + except ImportError: + _SETUPTOOLS_INSTALLED = False + + if _SETUPTOOLS_INSTALLED: + try: + from apiclient.discovery import build + from apiclient.http import MediaFileUpload + from apiclient.errors import HttpError - from oauth2client.client import OAuth2WebServerFlow - from oauth2client.client import AccessTokenRefreshError - from oauth2client.client import flow_from_clientsecrets - from oauth2client.file import Storage - from oauth2client.tools import run - _GOOGLE_API_CLIENT_INSTALLED=True - _GOOGLE_API_CLIENT_VERSION = pkg_resources.get_distribution('google-api-python-client').version + from oauth2client.client import OAuth2WebServerFlow + from oauth2client.client import AccessTokenRefreshError + from oauth2client.client import flow_from_clientsecrets + from oauth2client.file import Storage + from oauth2client.tools import run + _GOOGLE_API_CLIENT_INSTALLED=True + _GOOGLE_API_CLIENT_VERSION = pkg_resources.get_distribution('google-api-python-client').version - if LooseVersion(_GOOGLE_API_CLIENT_VERSION >= '1.2.0'): - _GOOGLE_API_CLIENT_VALID_VERSION = True + if LooseVersion(_GOOGLE_API_CLIENT_VERSION) >= '1.2.0': + _GOOGLE_API_CLIENT_VALID_VERSION = True - except ImportError: - _GOOGLE_API_CLIENT_INSTALLED = False + except ImportError: + _GOOGLE_API_CLIENT_INSTALLED = False - try: - import gflags as flags - _GOOGLE_FLAGS_INSTALLED = True + try: + import gflags as flags + _GOOGLE_FLAGS_INSTALLED = True - _GOOGLE_FLAGS_VERSION = pkg_resources.get_distribution('python-gflags').version + _GOOGLE_FLAGS_VERSION = pkg_resources.get_distribution('python-gflags').version - if LooseVersion(_GOOGLE_FLAGS_VERSION >= '2.0.0'): - 
_GOOGLE_FLAGS_VALID_VERSION = True + if LooseVersion(_GOOGLE_FLAGS_VERSION) >= '2.0': + _GOOGLE_FLAGS_VALID_VERSION = True - except ImportError: - _GOOGLE_FLAGS_INSTALLED = False + except ImportError: + _GOOGLE_FLAGS_INSTALLED = False - try: - import httplib2 - _HTTPLIB2_INSTALLED = True - except ImportError: - _HTTPLIB2_INSTALLED = False + try: + import httplib2 + _HTTPLIB2_INSTALLED = True + except ImportError: + _HTTPLIB2_INSTALLED = False logger = logging.getLogger('pandas.io.gbq') @@ -296,10 +303,14 @@ def _test_imports(): _GOOGLE_FLAGS_INSTALLED _GOOGLE_FLAGS_VALID_VERSION _HTTPLIB2_INSTALLED + _SETUPTOOLS_INSTALLED if compat.PY3: raise NotImplementedError("Google's libraries do not support Python 3 yet") + if not _SETUPTOOLS_INSTALLED: + raise ImportError('Could not import pkg_resources (setuptools).') + if not _GOOGLE_API_CLIENT_INSTALLED: raise ImportError('Could not import Google API Client.') diff --git a/pandas/io/html.py b/pandas/io/html.py index 1fe86201a8db0..402758815e95b 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -645,9 +645,9 @@ def _parser_dispatch(flavor): if flavor in ('bs4', 'html5lib'): if not _HAS_HTML5LIB: - raise ImportError("html5lib not found please install it") + raise ImportError("html5lib not found, please install it") if not _HAS_BS4: - raise ImportError("bs4 not found please install it") + raise ImportError("BeautifulSoup4 (bs4) not found, please install it") if bs4.__version__ == LooseVersion('4.2.0'): raise ValueError("You're using a version" " of BeautifulSoup4 (4.2.0) that has been" @@ -658,7 +658,7 @@ def _parser_dispatch(flavor): " and later releases will work.") else: if not _HAS_LXML: - raise ImportError("lxml not found please install it") + raise ImportError("lxml not found, please install it") return _valid_parsers[flavor] @@ -711,9 +711,14 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types, else: raise_with_traceback(retained) - return [_data_to_frame(table, header, index_col, skiprows, infer_types, - parse_dates, tupleize_cols, thousands) - for table in tables] + ret = [] + for table in tables: + try: + ret.append(_data_to_frame(table, header, index_col, skiprows, + infer_types, parse_dates, tupleize_cols, thousands)) + except StopIteration: # empty table + continue + return ret def read_html(io, match='.+', flavor=None, header=None, index_col=None, diff --git a/pandas/io/json.py b/pandas/io/json.py index 4ed325df9a747..9e8ef74545ef2 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -573,7 +573,9 @@ def nested_to_record(ds, prefix="", level=0): ------- d - dict or list of dicts, matching `ds` - Example: + Examples + -------- + IN[52]: nested_to_record(dict(flat1=1,dict1=dict(c=1,d=2), nested=dict(e=dict(c=1,d=2),d=2))) Out[52]: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 6d2afbad36e35..72d8b2720c747 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -65,8 +65,8 @@ class ParserWarning(Warning): a list of integers that specify row locations for a multi-index on the columns E.g. [0,1,3]. Intervening rows that are not specified will be skipped (e.g. 2 in this example are skipped). Note that this parameter - ignores commented lines, so header=0 denotes the first line of - data rather than the first line of the file. + ignores commented lines and empty lines if ``skip_blank_lines=True``, so header=0 + denotes the first line of data rather than the first line of the file. 
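One consequence of the html.py change above: a page containing an empty <table> no longer aborts the whole parse; the empty table is simply skipped. A hedged sketch (the URL is hypothetical):

    import pandas as pd

    # tables that yield no rows are dropped from the result list instead of
    # raising, so len(tables) counts only the non-empty tables on the page
    tables = pd.read_html('http://example.com/report.html', match='.+')
    print(len(tables))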
skiprows : list-like or integer Line numbers to skip (0-indexed) or number of lines to skip (int) at the start of the file @@ -110,10 +110,11 @@ class ParserWarning(Warning): comment : str, default None Indicates remainder of line should not be parsed. If found at the beginning of a line, the line will be ignored altogether. This parameter - must be a single character. Also, fully commented lines - are ignored by the parameter `header` but not by `skiprows`. For example, - if comment='#', parsing '#empty\n1,2,3\na,b,c' with `header=0` will - result in '1,2,3' being treated as the header. + must be a single character. Like empty lines (as long as ``skip_blank_lines=True``), + fully commented lines are ignored by the parameter `header` + but not by `skiprows`. For example, if comment='#', parsing + '#empty\n1,2,3\na,b,c' with `header=0` will result in '1,2,3' being + treated as the header. decimal : str, default '.' Character to recognize as decimal point. E.g. use ',' for European data nrows : int, default None @@ -132,7 +133,9 @@ class ParserWarning(Warning): delimiter : string, default None Alternative argument name for sep. Regular expressions are accepted. encoding : string, default None - Encoding to use for UTF when reading/writing (ex. 'utf-8') + Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python + standard encodings + `_ squeeze : boolean, default False If the parsed data only contains one column then return a Series na_filter : boolean, default True @@ -158,6 +161,8 @@ class ParserWarning(Warning): infer_datetime_format : boolean, default False If True and parse_dates is enabled for a column, attempt to infer the datetime format to speed up the processing +skip_blank_lines : boolean, default True + If True, skip over blank lines rather than interpreting as NaN values Returns ------- @@ -286,6 +291,7 @@ def _read(filepath_or_buffer, kwds): 'mangle_dupe_cols': True, 'tupleize_cols': False, 'infer_datetime_format': False, + 'skip_blank_lines': True } @@ -301,7 +307,8 @@ def _read(filepath_or_buffer, kwds): 'error_bad_lines': True, 'warn_bad_lines': True, 'dtype': None, - 'decimal': b'.' + 'decimal': b'.', + 'float_precision': None } _fwf_defaults = { @@ -367,6 +374,7 @@ def parser_f(filepath_or_buffer, date_parser=None, memory_map=False, + float_precision=None, nrows=None, iterator=False, chunksize=None, @@ -376,7 +384,8 @@ def parser_f(filepath_or_buffer, squeeze=False, mangle_dupe_cols=True, tupleize_cols=False, - infer_datetime_format=False): + infer_datetime_format=False, + skip_blank_lines=True): # Alias sep -> delimiter. if delimiter is None: @@ -435,6 +444,7 @@ def parser_f(filepath_or_buffer, encoding=encoding, squeeze=squeeze, memory_map=memory_map, + float_precision=float_precision, na_filter=na_filter, compact_ints=compact_ints, @@ -447,7 +457,8 @@ def parser_f(filepath_or_buffer, buffer_lines=buffer_lines, mangle_dupe_cols=mangle_dupe_cols, tupleize_cols=tupleize_cols, - infer_datetime_format=infer_datetime_format) + infer_datetime_format=infer_datetime_format, + skip_blank_lines=skip_blank_lines) return _read(filepath_or_buffer, kwds) @@ -703,7 +714,7 @@ def read(self, nrows=None): df = DataFrame(col_dict, columns=columns, index=index) if self.squeeze and len(df.columns) == 1: - return df[df.columns[0]] + return df[df.columns[0]].copy() return df def _create_index(self, ret): @@ -1262,6 +1273,11 @@ def TextParser(*args, **kwds): If True and `parse_dates` is True for a column, try to infer the datetime format based on the first datetime string. 
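The two new read_csv knobs introduced in the hunks above can be exercised directly; a short sketch (skip_blank_lines is the new default, float_precision applies to the C engine):

    import pandas as pd
    from pandas.compat import StringIO

    data = "a,b\n1,2\n\n3,4\n"
    # blank lines are skipped rather than read as all-NaN rows
    df = pd.read_csv(StringIO(data), skip_blank_lines=True)

    # choose the C-engine float converter: None, 'high', or 'round_trip'
    df2 = pd.read_csv(StringIO("x\n0.1\n0.7\n"), float_precision='round_trip')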
If the format can be inferred, there often will be a large parsing speed-up. + float_precision : string, default None + Specifies which converter the C engine should use for floating-point + values. The options are None for the ordinary converter, + 'high' for the high-precision converter, and 'round_trip' for the + round-trip converter. """ kwds['engine'] = 'python' return TextFileReader(*args, **kwds) @@ -1336,6 +1352,7 @@ def __init__(self, f, **kwds): self.quoting = kwds['quoting'] self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True) self.usecols = kwds['usecols'] + self.skip_blank_lines = kwds['skip_blank_lines'] self.names_passed = kwds['names'] or None @@ -1391,6 +1408,7 @@ def __init__(self, f, **kwds): # needs to be cleaned/refactored # multiple date column thing turning into a real spaghetti factory + if not self._has_complex_date_col: (index_names, self.orig_names, self.columns) = self._get_index_name(self.columns) @@ -1588,6 +1606,7 @@ def _infer_columns(self): while self.line_pos <= hr: line = self._next_line() + unnamed_count = 0 this_columns = [] for i, c in enumerate(line): @@ -1725,25 +1744,35 @@ def _next_line(self): line = self._check_comments([self.data[self.pos]])[0] self.pos += 1 # either uncommented or blank to begin with - if self._empty(self.data[self.pos - 1]) or line: + if not self.skip_blank_lines and (self._empty(self.data[ + self.pos - 1]) or line): break + elif self.skip_blank_lines: + ret = self._check_empty([line]) + if ret: + line = ret[0] + break except IndexError: raise StopIteration else: while self.pos in self.skiprows: - next(self.data) self.pos += 1 + next(self.data) while True: orig_line = next(self.data) line = self._check_comments([orig_line])[0] self.pos += 1 - if self._empty(orig_line) or line: + if not self.skip_blank_lines and (self._empty(orig_line) or line): break + elif self.skip_blank_lines: + ret = self._check_empty([line]) + if ret: + line = ret[0] + break self.line_pos += 1 self.buf.append(line) - return line def _check_comments(self, lines): @@ -1764,6 +1793,15 @@ def _check_comments(self, lines): ret.append(rl) return ret + def _check_empty(self, lines): + ret = [] + for l in lines: + # Remove empty lines and lines with only one whitespace value + if len(l) > 1 or len(l) == 1 and (not isinstance(l[0], + compat.string_types) or l[0].strip()): + ret.append(l) + return ret + def _check_thousands(self, lines): if self.thousands is None: return lines @@ -1899,7 +1937,6 @@ def _get_lines(self, rows=None): # already fetched some number if rows is not None: - # we already have the lines in the buffer if len(self.buf) >= rows: new_rows, self.buf = self.buf[:rows], self.buf[rows:] @@ -1964,6 +2001,8 @@ def _get_lines(self, rows=None): lines = lines[:-self.skip_footer] lines = self._check_comments(lines) + if self.skip_blank_lines: + lines = self._check_empty(lines) return self._check_thousands(lines) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 07e9abeaadbb4..f1745fe8579bb 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -332,6 +332,16 @@ def read_hdf(path_or_buf, key, **kwargs): key, auto_close=auto_close, **kwargs) if isinstance(path_or_buf, string_types): + + try: + exists = os.path.exists(path_or_buf) + + #if filepath is too long + except (TypeError,ValueError): + exists = False + + if not exists: + raise IOError('File %s does not exist' % path_or_buf) # can't auto open/close if we are using an iterator # so delegate to the iterator @@ -398,8 +408,8 @@ def __init__(self, path, mode=None, 
complevel=None, complib=None, fletcher32=False, **kwargs): try: import tables - except ImportError: # pragma: no cover - raise ImportError('HDFStore requires PyTables') + except ImportError as ex: # pragma: no cover + raise ImportError('HDFStore requires PyTables, "{ex}" problem importing'.format(ex=str(ex))) self._path = path if mode is None: @@ -970,7 +980,7 @@ def append_to_multiple(self, d, value, selector, data_columns=None, remain_values.extend(v) if remain_key is not None: ordered = value.axes[axis] - ordd = ordered - Index(remain_values) + ordd = ordered.difference(Index(remain_values)) ordd = sorted(ordered.get_indexer(ordd)) d[remain_key] = ordered.take(ordd) @@ -3245,7 +3255,7 @@ def get_blk_items(mgr, blocks): data_columns, min_itemsize) if len(data_columns): mgr = block_obj.reindex_axis( - Index(axis_labels) - Index(data_columns), + Index(axis_labels).difference(Index(data_columns)), axis=axis )._data @@ -3362,7 +3372,7 @@ def process_filter(field, filt): # if we have a multi-index, then need to include # the levels if self.is_multi_index: - filt = filt + Index(self.levels) + filt = filt.union(Index(self.levels)) takers = op(axis_values, filt) return obj.ix._getitem_axis(takers, @@ -3522,8 +3532,8 @@ def read(self, where=None, columns=None, **kwargs): return None factors = [Categorical.from_array(a.values) for a in self.index_axes] - levels = [f.levels for f in factors] - N = [len(f.levels) for f in factors] + levels = [f.categories for f in factors] + N = [len(f.categories) for f in factors] labels = [f.codes for f in factors] # compute the key diff --git a/pandas/io/sql.py b/pandas/io/sql.py index cb234f825a51e..09acfcaee976b 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -1,22 +1,27 @@ +# -*- coding: utf-8 -*- """ Collection of query wrappers / abstractions to both facilitate data retrieval and to reduce dependency on DB-specific API. 
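Several hunks in this diff (Series.combine, HDFStore.append_to_multiple, the table filters above) swap Index arithmetic (`+`/`-`) for the explicit set operations. The equivalent public API, as a quick reference:

    import pandas as pd

    left = pd.Index(['a', 'b', 'c'])
    right = pd.Index(['b', 'd'])

    left.union(right)        # replaces `left + right` for combining labels
    left.difference(right)   # replaces `left - right` for removing labels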
""" + from __future__ import print_function, division -from datetime import datetime, date, timedelta +from datetime import datetime, date import warnings import traceback -import itertools import re import numpy as np +import pandas.lib as lib import pandas.core.common as com from pandas.compat import lzip, map, zip, raise_with_traceback, string_types from pandas.core.api import DataFrame, Series +from pandas.core.common import isnull from pandas.core.base import PandasObject from pandas.tseries.tools import to_datetime +from contextlib import contextmanager + class SQLAlchemyRequired(ImportError): pass @@ -27,17 +32,18 @@ class DatabaseError(IOError): #------------------------------------------------------------------------------ -# Helper functions +#--- Helper functions _SQLALCHEMY_INSTALLED = None + def _is_sqlalchemy_engine(con): global _SQLALCHEMY_INSTALLED if _SQLALCHEMY_INSTALLED is None: try: import sqlalchemy _SQLALCHEMY_INSTALLED = True - + from distutils.version import LooseVersion ver = LooseVersion(sqlalchemy.__version__) # For sqlalchemy versions < 0.8.2, the BIGINT type is recognized @@ -46,7 +52,7 @@ def _is_sqlalchemy_engine(con): if ver < '0.8.2': from sqlalchemy import BigInteger from sqlalchemy.ext.compiler import compiles - + @compiles(BigInteger, 'sqlite') def compile_big_int_sqlite(type_, compiler, **kw): return 'INTEGER' @@ -77,7 +83,8 @@ def _handle_date_column(col, format=None): else: if format in ['D', 's', 'ms', 'us', 'ns']: return to_datetime(col, coerce=True, unit=format) - elif issubclass(col.dtype.type, np.floating) or issubclass(col.dtype.type, np.integer): + elif (issubclass(col.dtype.type, np.floating) + or issubclass(col.dtype.type, np.integer)): # parse dates as timestamp format = 's' if format is None else format return to_datetime(col, coerce=True, unit=format) @@ -86,8 +93,9 @@ def _handle_date_column(col, format=None): def _parse_date_columns(data_frame, parse_dates): - """ Force non-datetime columns to be read as such. - Supports both string formatted and integer timestamp columns + """ + Force non-datetime columns to be read as such. + Supports both string formatted and integer timestamp columns """ # handle non-list entries for parse_dates gracefully if parse_dates is True or parse_dates is None or parse_dates is False: @@ -107,6 +115,21 @@ def _parse_date_columns(data_frame, parse_dates): return data_frame +def _wrap_result(data, columns, index_col=None, coerce_float=True, + parse_dates=None): + """Wrap result set of query in a DataFrame """ + + frame = DataFrame.from_records(data, columns=columns, + coerce_float=coerce_float) + + _parse_date_columns(frame, parse_dates) + + if index_col is not None: + frame.set_index(index_col, inplace=True) + + return frame + + def execute(sql, con, cur=None, params=None): """ Execute the given SQL query using the provided connection object. @@ -144,11 +167,12 @@ def _safe_fetch(cur): if not isinstance(result, list): result = list(result) return result - except Exception as e: # pragma: no cover + except Exception as e: # pragma: no cover excName = e.__class__.__name__ if excName == 'OperationalError': return [] + def tquery(sql, con=None, cur=None, retry=True): """ DEPRECATED. 
Returns list of tuples corresponding to each row in given sql @@ -186,7 +210,7 @@ def tquery(sql, con=None, cur=None, retry=True): con.commit() except Exception as e: excName = e.__class__.__name__ - if excName == 'OperationalError': # pragma: no cover + if excName == 'OperationalError': # pragma: no cover print('Failed to commit, may need to restart interpreter') else: raise @@ -198,7 +222,7 @@ def tquery(sql, con=None, cur=None, retry=True): if result and len(result[0]) == 1: # python 3 compat result = list(lzip(*result)[0]) - elif result is None: # pragma: no cover + elif result is None: # pragma: no cover result = [] return result @@ -206,8 +230,8 @@ def tquery(sql, con=None, cur=None, retry=True): def uquery(sql, con=None, cur=None, retry=True, params=None): """ - DEPRECATED. Does the same thing as tquery, but instead of returning results, it - returns the number of rows affected. Good for update queries. + DEPRECATED. Does the same thing as tquery, but instead of returning + results, it returns the number of rows affected. Good for update queries. To obtain the same result in the future, you can use the following: @@ -252,8 +276,9 @@ def uquery(sql, con=None, cur=None, retry=True, params=None): #------------------------------------------------------------------------------ #--- Read and write to DataFrames -def read_sql_table(table_name, con, index_col=None, coerce_float=True, - parse_dates=None, columns=None): +def read_sql_table(table_name, con, schema=None, index_col=None, + coerce_float=True, parse_dates=None, columns=None, + chunksize=None): """Read SQL database table into a DataFrame. Given a table name and an SQLAlchemy engine, returns a DataFrame. @@ -265,6 +290,9 @@ def read_sql_table(table_name, con, index_col=None, coerce_float=True, Name of SQL table in database con : SQLAlchemy engine Sqlite DBAPI connection mode not supported + schema : string, default None + Name of SQL schema in database to query (if database flavor + supports this). If None, use default schema (default). index_col : string, optional Column to set as index coerce_float : boolean, default True @@ -281,6 +309,9 @@ def read_sql_table(table_name, con, index_col=None, coerce_float=True, such as SQLite columns : list List of column names to select from sql table + chunksize : int, default None + If specified, return an iterator where `chunksize` is the number of + rows to include in each chunk. Returns ------- @@ -297,16 +328,16 @@ def read_sql_table(table_name, con, index_col=None, coerce_float=True, "SQLAlchemy engines.") import sqlalchemy from sqlalchemy.schema import MetaData - meta = MetaData(con) + meta = MetaData(con, schema=schema) try: meta.reflect(only=[table_name]) except sqlalchemy.exc.InvalidRequestError: raise ValueError("Table %s not found" % table_name) - pandas_sql = PandasSQLAlchemy(con, meta=meta) + pandas_sql = SQLDatabase(con, meta=meta) table = pandas_sql.read_table( table_name, index_col=index_col, coerce_float=coerce_float, - parse_dates=parse_dates, columns=columns) + parse_dates=parse_dates, columns=columns, chunksize=chunksize) if table is not None: return table @@ -315,7 +346,7 @@ def read_sql_table(table_name, con, index_col=None, coerce_float=True, def read_sql_query(sql, con, index_col=None, coerce_float=True, params=None, - parse_dates=None): + parse_dates=None, chunksize=None): """Read SQL query into a DataFrame. 
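read_sql_table gains schema support alongside chunksize, as documented above. A hedged usage sketch (engine URL, schema and table names are illustrative; SQLAlchemy is required):

    import pandas as pd
    from sqlalchemy import create_engine

    engine = create_engine('postgresql://user:pass@localhost/mydb')  # hypothetical DSN
    df = pd.read_sql_table('events', engine, schema='analytics',
                           columns=['id', 'value'])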
Returns a DataFrame corresponding to the result set of the query @@ -336,7 +367,11 @@ def read_sql_query(sql, con, index_col=None, coerce_float=True, params=None, Attempt to convert values to non-string, non-numeric objects (like decimal.Decimal) to floating point, useful for SQL result sets params : list, tuple or dict, optional - List of parameters to pass to execute method. + List of parameters to pass to execute method. The syntax used + to pass parameters is database driver dependent. Check your + database driver documentation for which of the five syntax styles, + described in PEP 249's paramstyle, is supported. + Eg. for psycopg2, uses %(name)s so use params={'name' : 'value'} parse_dates : list or dict - List of column names to parse as dates - Dict of ``{column_name: format string}`` where format string is @@ -346,6 +381,9 @@ def read_sql_query(sql, con, index_col=None, coerce_float=True, params=None, to the keyword arguments of :func:`pandas.to_datetime` Especially useful with databases without native Datetime support, such as SQLite + chunksize : int, default None + If specified, return an iterator where `chunksize` is the number of + rows to include in each chunk. Returns ------- @@ -358,13 +396,13 @@ def read_sql_query(sql, con, index_col=None, coerce_float=True, params=None, """ pandas_sql = pandasSQL_builder(con) - return pandas_sql.read_sql( + return pandas_sql.read_query( sql, index_col=index_col, params=params, coerce_float=coerce_float, - parse_dates=parse_dates) + parse_dates=parse_dates, chunksize=chunksize) def read_sql(sql, con, index_col=None, coerce_float=True, params=None, - parse_dates=None, columns=None): + parse_dates=None, columns=None, chunksize=None): """ Read SQL query or database table into a DataFrame. @@ -372,7 +410,7 @@ def read_sql(sql, con, index_col=None, coerce_float=True, params=None, ---------- sql : string SQL query to be executed or database table name. - con : SQLAlchemy engine or DBAPI2 connection (legacy mode) + con : SQLAlchemy engine or DBAPI2 connection (fallback mode) Using SQLAlchemy makes it possible to use any DB supported by that library. If a DBAPI2 object, only sqlite3 is supported. @@ -382,7 +420,11 @@ def read_sql(sql, con, index_col=None, coerce_float=True, params=None, Attempt to convert values to non-string, non-numeric objects (like decimal.Decimal) to floating point, useful for SQL result sets params : list, tuple or dict, optional - List of parameters to pass to execute method. + List of parameters to pass to execute method. The syntax used + to pass parameters is database driver dependent. Check your + database driver documentation for which of the five syntax styles, + described in PEP 249's paramstyle, is supported. + Eg. for psycopg2, uses %(name)s so use params={'name' : 'value'} parse_dates : list or dict - List of column names to parse as dates - Dict of ``{column_name: format string}`` where format string is @@ -395,6 +437,9 @@ def read_sql(sql, con, index_col=None, coerce_float=True, params=None, columns : list List of column names to select from sql table (only used when reading a table). + chunksize : int, default None + If specified, return an iterator where `chunksize` is the + number of rows to include in each chunk. 
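When chunksize is given, read_sql_query yields DataFrames instead of returning a single frame, which keeps memory bounded for large result sets. A sketch, assuming a SQLAlchemy engine and a hypothetical table (the named parameter follows the psycopg2 note in the docstring above):

    import pandas as pd
    from sqlalchemy import create_engine

    engine = create_engine('postgresql://user:pass@localhost/mydb')  # hypothetical DSN

    query = "SELECT * FROM events WHERE kind = %(kind)s"  # psycopg2-style named param
    for chunk in pd.read_sql_query(query, engine, params={'kind': 'click'},
                                   chunksize=1000):
        print(len(chunk))   # each chunk is a DataFrame of at most 1000 rows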
Returns ------- @@ -415,24 +460,31 @@ def read_sql(sql, con, index_col=None, coerce_float=True, params=None, """ pandas_sql = pandasSQL_builder(con) - if isinstance(pandas_sql, PandasSQLLegacy): - return pandas_sql.read_sql( + if isinstance(pandas_sql, SQLiteDatabase): + return pandas_sql.read_query( sql, index_col=index_col, params=params, - coerce_float=coerce_float, parse_dates=parse_dates) + coerce_float=coerce_float, parse_dates=parse_dates, + chunksize=chunksize) + + try: + _is_table_name = pandas_sql.has_table(sql) + except: + _is_table_name = False - if pandas_sql.has_table(sql): + if _is_table_name: pandas_sql.meta.reflect(only=[sql]) return pandas_sql.read_table( sql, index_col=index_col, coerce_float=coerce_float, - parse_dates=parse_dates, columns=columns) + parse_dates=parse_dates, columns=columns, chunksize=chunksize) else: - return pandas_sql.read_sql( + return pandas_sql.read_query( sql, index_col=index_col, params=params, - coerce_float=coerce_float, parse_dates=parse_dates) + coerce_float=coerce_float, parse_dates=parse_dates, + chunksize=chunksize) -def to_sql(frame, name, con, flavor='sqlite', if_exists='fail', index=True, - index_label=None): +def to_sql(frame, name, con, flavor='sqlite', schema=None, if_exists='fail', + index=True, index_label=None, chunksize=None): """ Write records stored in a DataFrame to a SQL database. @@ -449,6 +501,9 @@ def to_sql(frame, name, con, flavor='sqlite', if_exists='fail', index=True, The flavor of SQL to use. Ignored when using SQLAlchemy engine. 'mysql' is deprecated and will be removed in future versions, but it will be further supported through SQLAlchemy engines. + schema : string, default None + Name of SQL schema in database to write to (if database flavor + supports this). If None, use default schema (default). if_exists : {'fail', 'replace', 'append'}, default 'fail' - fail: If table exists, do nothing. - replace: If table exists, drop it, recreate it, and insert data. @@ -459,12 +514,15 @@ def to_sql(frame, name, con, flavor='sqlite', if_exists='fail', index=True, Column label for index column(s). If None is given (default) and `index` is True, then the index names are used. A sequence should be given if the DataFrame uses MultiIndex. + chunksize : int, default None + If not None, then rows will be written in batches of this size at a + time. If None, all rows will be written at once. """ if if_exists not in ('fail', 'replace', 'append'): raise ValueError("'{0}' is not valid for if_exists".format(if_exists)) - pandas_sql = pandasSQL_builder(con, flavor=flavor) + pandas_sql = pandasSQL_builder(con, schema=schema, flavor=flavor) if isinstance(frame, Series): frame = frame.to_frame() @@ -472,10 +530,11 @@ def to_sql(frame, name, con, flavor='sqlite', if_exists='fail', index=True, raise NotImplementedError pandas_sql.to_sql(frame, name, if_exists=if_exists, index=index, - index_label=index_label) + index_label=index_label, schema=schema, + chunksize=chunksize) -def has_table(table_name, con, flavor='sqlite'): +def has_table(table_name, con, flavor='sqlite', schema=None): """ Check if DataBase has named table. @@ -491,12 +550,15 @@ def has_table(table_name, con, flavor='sqlite'): The flavor of SQL to use. Ignored when using SQLAlchemy engine. 'mysql' is deprecated and will be removed in future versions, but it will be further supported through SQLAlchemy engines. + schema : string, default None + Name of SQL schema in database to write to (if database flavor supports + this). If None, use default schema (default). 
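A brief usage sketch (the example data is an assumption) of the batched writes added via ``chunksize`` in ``to_sql``, and of ``read_sql`` treating the statement as a plain query when given a DBAPI connection:

import sqlite3
import pandas as pd
from pandas.io import sql

con = sqlite3.connect(':memory:')
frame = pd.DataFrame({'a': [0.1, 0.2, 0.3, 0.4, 0.5], 'b': list('vwxyz')})

# rows are inserted in batches of at most 2 instead of a single executemany call
sql.to_sql(frame, 'demo', con, if_exists='replace', index=False, chunksize=2)

# with a plain DBAPI connection, read_sql always runs the string as a query
roundtrip = pd.read_sql('SELECT * FROM demo', con)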
Returns ------- boolean """ - pandas_sql = pandasSQL_builder(con, flavor=flavor) + pandas_sql = pandasSQL_builder(con, flavor=flavor, schema=schema) return pandas_sql.has_table(table_name) table_exists = has_table @@ -506,7 +568,9 @@ def has_table(table_name, con, flavor='sqlite'): "and will be removed in future versions. " "MySQL will be further supported with SQLAlchemy engines.") -def pandasSQL_builder(con, flavor=None, meta=None, is_cursor=False): + +def pandasSQL_builder(con, flavor=None, schema=None, meta=None, + is_cursor=False): """ Convenience function to return the correct PandasSQL subclass based on the provided parameters @@ -514,14 +578,14 @@ def pandasSQL_builder(con, flavor=None, meta=None, is_cursor=False): # When support for DBAPI connections is removed, # is_cursor should not be necessary. if _is_sqlalchemy_engine(con): - return PandasSQLAlchemy(con, meta=meta) + return SQLDatabase(con, schema=schema, meta=meta) else: if flavor == 'mysql': warnings.warn(_MYSQL_WARNING, FutureWarning) - return PandasSQLLegacy(con, flavor, is_cursor=is_cursor) + return SQLiteDatabase(con, flavor, is_cursor=is_cursor) -class PandasSQLTable(PandasObject): +class SQLTable(PandasObject): """ For mapping Pandas tables to SQL tables. Uses fact that table is reflected by SQLAlchemy to @@ -531,58 +595,57 @@ class PandasSQLTable(PandasObject): """ # TODO: support for multiIndex def __init__(self, name, pandas_sql_engine, frame=None, index=True, - if_exists='fail', prefix='pandas', index_label=None): + if_exists='fail', prefix='pandas', index_label=None, + schema=None, keys=None): self.name = name self.pd_sql = pandas_sql_engine self.prefix = prefix self.frame = frame self.index = self._index_name(index, index_label) + self.schema = schema + self.if_exists = if_exists + self.keys = keys if frame is not None: - # We want to write a frame - if self.pd_sql.has_table(self.name): - if if_exists == 'fail': - raise ValueError("Table '%s' already exists." % name) - elif if_exists == 'replace': - self.pd_sql.drop_table(self.name) - self.table = self._create_table_statement() - self.create() - elif if_exists == 'append': - self.table = self.pd_sql.get_table(self.name) - if self.table is None: - self.table = self._create_table_statement() - else: - raise ValueError( - "'{0}' is not valid for if_exists".format(if_exists)) - else: - self.table = self._create_table_statement() - self.create() + # We want to initialize based on a dataframe + self.table = self._create_table_setup() else: # no data provided, read-only mode - self.table = self.pd_sql.get_table(self.name) + self.table = self.pd_sql.get_table(self.name, self.schema) if self.table is None: raise ValueError("Could not init table '%s'" % name) def exists(self): - return self.pd_sql.has_table(self.name) + return self.pd_sql.has_table(self.name, self.schema) def sql_schema(self): from sqlalchemy.schema import CreateTable return str(CreateTable(self.table)) - def create(self): + def _execute_create(self): + # Inserting table into database, add to MetaData object + self.table = self.table.tometadata(self.pd_sql.meta) self.table.create() + def create(self): + if self.exists(): + if self.if_exists == 'fail': + raise ValueError("Table '%s' already exists." 
% self.name) + elif self.if_exists == 'replace': + self.pd_sql.drop_table(self.name, self.schema) + self._execute_create() + elif self.if_exists == 'append': + pass + else: + raise ValueError( + "'{0}' is not valid for if_exists".format(self.if_exists)) + else: + self._execute_create() + def insert_statement(self): return self.table.insert() - def maybe_asscalar(self, i): - try: - return np.asscalar(i) - except AttributeError: - return i - def insert_data(self): if self.index is not None: temp = self.frame.copy() @@ -595,22 +658,80 @@ def insert_data(self): else: temp = self.frame - return temp + column_names = list(map(str, temp.columns)) + ncols = len(column_names) + data_list = [None] * ncols + blocks = temp._data.blocks + + for i in range(len(blocks)): + b = blocks[i] + if b.is_datetime: + # convert to microsecond resolution so this yields + # datetime.datetime + d = b.values.astype('M8[us]').astype(object) + else: + d = np.array(b.values, dtype=object) + + # replace NaN with None + if b._can_hold_na: + mask = isnull(d) + d[mask] = None + + for col_loc, col in zip(b.mgr_locs, d): + data_list[col_loc] = col + + return column_names, data_list - def insert(self): - ins = self.insert_statement() - data_list = [] - temp = self.insert_data() - keys = list(map(str, temp.columns)) + def _execute_insert(self, conn, keys, data_iter): + data = [dict((k, v) for k, v in zip(keys, row)) for row in data_iter] + conn.execute(self.insert_statement(), data) - for t in temp.itertuples(): - data = dict((k, self.maybe_asscalar(v)) - for k, v in zip(keys, t[1:])) - data_list.append(data) + def insert(self, chunksize=None): + keys, data_list = self.insert_data() - self.pd_sql.execute(ins, data_list) + nrows = len(self.frame) - def read(self, coerce_float=True, parse_dates=None, columns=None): + if nrows == 0: + return + + if chunksize is None: + chunksize = nrows + elif chunksize == 0: + raise ValueError('chunksize argument should be non-zero') + + chunks = int(nrows / chunksize) + 1 + + with self.pd_sql.run_transaction() as conn: + for i in range(chunks): + start_i = i * chunksize + end_i = min((i + 1) * chunksize, nrows) + if start_i >= end_i: + break + + chunk_iter = zip(*[arr[start_i:end_i] for arr in data_list]) + self._execute_insert(conn, keys, chunk_iter) + + def _query_iterator(self, result, chunksize, columns, coerce_float=True, + parse_dates=None): + """Return generator through chunked result set""" + + while True: + data = result.fetchmany(chunksize) + if not data: + break + else: + self.frame = DataFrame.from_records( + data, columns=columns, coerce_float=coerce_float) + + self._harmonize_columns(parse_dates=parse_dates) + + if self.index is not None: + self.frame.set_index(self.index, inplace=True) + + yield self.frame + + def read(self, coerce_float=True, parse_dates=None, columns=None, + chunksize=None): if columns is not None and len(columns) > 0: from sqlalchemy import select @@ -622,18 +743,23 @@ def read(self, coerce_float=True, parse_dates=None, columns=None): sql_select = self.table.select() result = self.pd_sql.execute(sql_select) - data = result.fetchall() column_names = result.keys() - self.frame = DataFrame.from_records( - data, columns=column_names, coerce_float=coerce_float) + if chunksize is not None: + return self._query_iterator(result, chunksize, column_names, + coerce_float=coerce_float, + parse_dates=parse_dates) + else: + data = result.fetchall() + self.frame = DataFrame.from_records( + data, columns=column_names, coerce_float=coerce_float) - 
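A worked sketch of the chunking arithmetic used in ``insert`` above: ``int(nrows / chunksize) + 1`` plans one iteration too many when the row count divides evenly, and the ``start_i >= end_i`` guard skips that empty trailing slice.

nrows, chunksize = 8, 4
chunks = int(nrows / chunksize) + 1              # 3 iterations planned
for i in range(chunks):
    start_i = i * chunksize
    end_i = min((i + 1) * chunksize, nrows)
    if start_i >= end_i:                         # skip the empty final slice
        break
    print(start_i, end_i)                        # 0 4, then 4 8, then break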
self._harmonize_columns(parse_dates=parse_dates) + self._harmonize_columns(parse_dates=parse_dates) - if self.index is not None: - self.frame.set_index(self.index, inplace=True) + if self.index is not None: + self.frame.set_index(self.index, inplace=True) - return self.frame + return self.frame def _index_name(self, index, index_label): # for writing: index=True to include index in sql table @@ -650,7 +776,8 @@ def _index_name(self, index, index_label): else: return index_label # return the used column labels for the index columns - if nlevels == 1 and 'index' not in self.frame.columns and self.frame.index.name is None: + if (nlevels == 1 and 'index' not in self.frame.columns + and self.frame.index.name is None): return ['index'] else: return [l if l is not None else "level_{0}".format(i) @@ -669,38 +796,51 @@ def _get_column_names_and_types(self, dtype_mapper): if self.index is not None: for i, idx_label in enumerate(self.index): idx_type = dtype_mapper( - self.frame.index.get_level_values(i).dtype) - column_names_and_types.append((idx_label, idx_type)) + self.frame.index.get_level_values(i)) + column_names_and_types.append((idx_label, idx_type, True)) + + column_names_and_types += [ + (str(self.frame.columns[i]), + dtype_mapper(self.frame.iloc[:, i]), + False) + for i in range(len(self.frame.columns)) + ] - column_names_and_types += zip( - list(map(str, self.frame.columns)), - map(dtype_mapper, self.frame.dtypes) - ) return column_names_and_types - def _create_table_statement(self): - from sqlalchemy import Table, Column + def _create_table_setup(self): + from sqlalchemy import Table, Column, PrimaryKeyConstraint column_names_and_types = \ self._get_column_names_and_types(self._sqlalchemy_type) - columns = [Column(name, typ) - for name, typ in column_names_and_types] + columns = [Column(name, typ, index=is_index) + for name, typ, is_index in column_names_and_types] + + if self.keys is not None: + pkc = PrimaryKeyConstraint(self.keys, name=self.name + '_pk') + columns.append(pkc) + + schema = self.schema or self.pd_sql.meta.schema - return Table(self.name, self.pd_sql.meta, *columns) + # At this point, attach to new metadata, only attach to self.meta + # once table is created. + from sqlalchemy.schema import MetaData + meta = MetaData(self.pd_sql, schema=schema) + + return Table(self.name, meta, *columns, schema=schema) def _harmonize_columns(self, parse_dates=None): - """ Make a data_frame's column type align with an sql_table - column types - Need to work around limited NA value support. - Floats are always fine, ints must always - be floats if there are Null values. - Booleans are hard because converting bool column with None replaces - all Nones with false. Therefore only convert bool if there are no - NA values. - Datetimes should already be converted - to np.datetime if supported, but here we also force conversion - if required + """ + Make the DataFrame's column types align with the SQL table + column types. + Need to work around limited NA value support. Floats are always + fine, ints must always be floats if there are Null values. + Booleans are hard because converting bool column with None replaces + all Nones with false. Therefore only convert bool if there are no + NA values. 
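For reference, a small standalone sketch of the SQLAlchemy constructs that ``_create_table_setup`` assembles above (column objects, an optional primary-key constraint, and a fresh ``MetaData`` that is only merged into the engine's metadata once the table is created); the table and column names here are illustrative only.

from sqlalchemy import BigInteger, Column, MetaData, Table, Text
from sqlalchemy.schema import PrimaryKeyConstraint

meta = MetaData()                                # standalone metadata, attached later
table = Table('demo', meta,
              Column('id', BigInteger, index=True),
              Column('name', Text),
              PrimaryKeyConstraint('id', name='demo_pk'))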
+ Datetimes should already be converted to np.datetime64 if supported, + but here we also force conversion if required """ # handle non-list entries for parse_dates gracefully if parse_dates is True or parse_dates is None or parse_dates is False: @@ -722,12 +862,12 @@ def _harmonize_columns(self, parse_dates=None): elif col_type is float: # floats support NA, can always convert! - self.frame[col_name].astype(col_type, copy=False) + self.frame[col_name] = df_col.astype(col_type, copy=False) elif len(df_col) == df_col.count(): # No NA values, can convert ints and bools - if col_type is int or col_type is bool: - self.frame[col_name].astype(col_type, copy=False) + if col_type is np.dtype('int64') or col_type is bool: + self.frame[col_name] = df_col.astype(col_type, copy=False) # Handle date parsing if col_name in parse_dates: @@ -741,30 +881,33 @@ def _harmonize_columns(self, parse_dates=None): except KeyError: pass # this column not in results - def _sqlalchemy_type(self, arr_or_dtype): + def _sqlalchemy_type(self, col): from sqlalchemy.types import (BigInteger, Float, Text, Boolean, - DateTime, Date, Interval) + DateTime, Date, Time) - if arr_or_dtype is date: - return Date - if com.is_datetime64_dtype(arr_or_dtype): + if com.is_datetime64_dtype(col): try: - tz = arr_or_dtype.tzinfo + tz = col.tzinfo return DateTime(timezone=True) except: return DateTime - if com.is_timedelta64_dtype(arr_or_dtype): + if com.is_timedelta64_dtype(col): warnings.warn("the 'timedelta' type is not supported, and will be " "written as integer values (ns frequency) to the " "database.", UserWarning) return BigInteger - elif com.is_float_dtype(arr_or_dtype): + elif com.is_float_dtype(col): return Float - elif com.is_integer_dtype(arr_or_dtype): + elif com.is_integer_dtype(col): # TODO: Refine integer size. return BigInteger - elif com.is_bool_dtype(arr_or_dtype): + elif com.is_bool_dtype(col): return Boolean + inferred = lib.infer_dtype(com._ensure_object(col)) + if inferred == 'date': + return Date + if inferred == 'time': + return Time return Text def _numpy_type(self, sqltype): @@ -774,7 +917,7 @@ def _numpy_type(self, sqltype): return float if isinstance(sqltype, Integer): # TODO: Refine integer size. - return int + return np.dtype('int64') if isinstance(sqltype, DateTime): # Caution: np.datetime64 is also a subclass of np.number. return datetime @@ -791,82 +934,234 @@ class PandasSQL(PandasObject): """ def read_sql(self, *args, **kwargs): - raise ValueError( - "PandasSQL must be created with an SQLAlchemy engine or connection+sql flavor") + raise ValueError("PandasSQL must be created with an SQLAlchemy engine" + " or connection+sql flavor") def to_sql(self, *args, **kwargs): - raise ValueError( - "PandasSQL must be created with an SQLAlchemy engine or connection+sql flavor") + raise ValueError("PandasSQL must be created with an SQLAlchemy engine" + " or connection+sql flavor") -class PandasSQLAlchemy(PandasSQL): +class SQLDatabase(PandasSQL): """ This class enables convertion between DataFrame and SQL databases using SQLAlchemy to handle DataBase abstraction + + Parameters + ---------- + engine : SQLAlchemy engine + Engine to connect with the database. Using SQLAlchemy makes it + possible to use any DB supported by that library. + schema : string, default None + Name of SQL schema in database to write to (if database flavor + supports this). If None, use default schema (default). + meta : SQLAlchemy MetaData object, default None + If provided, this MetaData object is used instead of a newly + created. 
This allows to specify database flavor specific + arguments in the MetaData object. + """ - def __init__(self, engine, meta=None): + def __init__(self, engine, schema=None, meta=None): self.engine = engine if not meta: from sqlalchemy.schema import MetaData - meta = MetaData(self.engine) + meta = MetaData(self.engine, schema=schema) self.meta = meta + def run_transaction(self): + return self.engine.begin() + def execute(self, *args, **kwargs): """Simple passthrough to SQLAlchemy engine""" return self.engine.execute(*args, **kwargs) def read_table(self, table_name, index_col=None, coerce_float=True, - parse_dates=None, columns=None): + parse_dates=None, columns=None, schema=None, + chunksize=None): + """Read SQL database table into a DataFrame. + + Parameters + ---------- + table_name : string + Name of SQL table in database + index_col : string, optional + Column to set as index + coerce_float : boolean, default True + Attempt to convert values to non-string, non-numeric objects + (like decimal.Decimal) to floating point. This can result in + loss of precision. + parse_dates : list or dict + - List of column names to parse as dates + - Dict of ``{column_name: format string}`` where format string is + strftime compatible in case of parsing string times or is one of + (D, s, ns, ms, us) in case of parsing integer timestamps + - Dict of ``{column_name: arg}``, where the arg corresponds + to the keyword arguments of :func:`pandas.to_datetime`. + Especially useful with databases without native Datetime support, + such as SQLite + columns : list + List of column names to select from sql table + schema : string, default None + Name of SQL schema in database to query (if database flavor + supports this). If specified, this overwrites the default + schema of the SQLDatabase object. + chunksize : int, default None + If specified, return an iterator where `chunksize` is the number + of rows to include in each chunk. + + Returns + ------- + DataFrame + + See also + -------- + pandas.read_sql_table + SQLDatabase.read_query - table = PandasSQLTable(table_name, self, index=index_col) + """ + table = SQLTable(table_name, self, index=index_col, schema=schema) return table.read(coerce_float=coerce_float, - parse_dates=parse_dates, columns=columns) + parse_dates=parse_dates, columns=columns, + chunksize=chunksize) + + @staticmethod + def _query_iterator(result, chunksize, columns, index_col=None, + coerce_float=True, parse_dates=None): + """Return generator through chunked result set""" + + while True: + data = result.fetchmany(chunksize) + if not data: + break + else: + yield _wrap_result(data, columns, index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates) - def read_sql(self, sql, index_col=None, coerce_float=True, - parse_dates=None, params=None): + def read_query(self, sql, index_col=None, coerce_float=True, + parse_dates=None, params=None, chunksize=None): + """Read SQL query into a DataFrame. + + Parameters + ---------- + sql : string + SQL query to be executed + index_col : string, optional + Column name to use as index for the returned DataFrame object. + coerce_float : boolean, default True + Attempt to convert values to non-string, non-numeric objects (like + decimal.Decimal) to floating point, useful for SQL result sets + params : list, tuple or dict, optional + List of parameters to pass to execute method. The syntax used + to pass parameters is database driver dependent. 
Check your + database driver documentation for which of the five syntax styles, + described in PEP 249's paramstyle, is supported. + Eg. for psycopg2, uses %(name)s so use params={'name' : 'value'} + parse_dates : list or dict + - List of column names to parse as dates + - Dict of ``{column_name: format string}`` where format string is + strftime compatible in case of parsing string times or is one of + (D, s, ns, ms, us) in case of parsing integer timestamps + - Dict of ``{column_name: arg dict}``, where the arg dict corresponds + to the keyword arguments of :func:`pandas.to_datetime` + Especially useful with databases without native Datetime support, + such as SQLite + + Returns + ------- + DataFrame + + See also + -------- + read_sql_table : Read SQL database table into a DataFrame + read_sql + + """ args = _convert_params(sql, params) result = self.execute(*args) - data = result.fetchall() columns = result.keys() - data_frame = DataFrame.from_records( - data, columns=columns, coerce_float=coerce_float) - - _parse_date_columns(data_frame, parse_dates) - - if index_col is not None: - data_frame.set_index(index_col, inplace=True) + if chunksize is not None: + return self._query_iterator(result, chunksize, columns, + index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates) + else: + data = result.fetchall() + frame = _wrap_result(data, columns, index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates) + return frame - return data_frame + read_sql = read_query def to_sql(self, frame, name, if_exists='fail', index=True, - index_label=None): - table = PandasSQLTable( - name, self, frame=frame, index=index, if_exists=if_exists, - index_label=index_label) - table.insert() + index_label=None, schema=None, chunksize=None): + """ + Write records stored in a DataFrame to a SQL database. + + Parameters + ---------- + frame : DataFrame + name : string + Name of SQL table + if_exists : {'fail', 'replace', 'append'}, default 'fail' + - fail: If table exists, do nothing. + - replace: If table exists, drop it, recreate it, and insert data. + - append: If table exists, insert data. Create if does not exist. + index : boolean, default True + Write DataFrame index as a column + index_label : string or sequence, default None + Column label for index column(s). If None is given (default) and + `index` is True, then the index names are used. + A sequence should be given if the DataFrame uses MultiIndex. + schema : string, default None + Name of SQL schema in database to write to (if database flavor + supports this). If specified, this overwrites the default + schema of the SQLDatabase object. + chunksize : int, default None + If not None, then rows will be written in batches of this size at a + time. If None, all rows will be written at once. + + """ + table = SQLTable(name, self, frame=frame, index=index, + if_exists=if_exists, index_label=index_label, + schema=schema) + table.create() + table.insert(chunksize) + # check for potentially case sensitivity issues (GH7815) + if name not in self.engine.table_names(schema=schema or self.meta.schema): + warnings.warn("The provided table name '{0}' is not found exactly " + "as such in the database after writing the table, " + "possibly due to case sensitivity issues. 
Consider " + "using lower case table names.".format(name), UserWarning) @property def tables(self): return self.meta.tables - def has_table(self, name): - return self.engine.has_table(name) + def has_table(self, name, schema=None): + return self.engine.has_table(name, schema or self.meta.schema) - def get_table(self, table_name): - return self.meta.tables.get(table_name) + def get_table(self, table_name, schema=None): + schema = schema or self.meta.schema + if schema: + return self.meta.tables.get('.'.join([schema, table_name])) + else: + return self.meta.tables.get(table_name) - def drop_table(self, table_name): - if self.engine.has_table(table_name): - self.meta.reflect(only=[table_name]) - self.get_table(table_name).drop() + def drop_table(self, table_name, schema=None): + schema = schema or self.meta.schema + if self.engine.has_table(table_name, schema): + self.meta.reflect(only=[table_name], schema=schema) + self.get_table(table_name, schema).drop() self.meta.clear() - def _create_sql_schema(self, frame, table_name): - table = PandasSQLTable(table_name, self, frame=frame) + def _create_sql_schema(self, frame, table_name, keys=None): + table = SQLTable(table_name, self, frame=frame, index=False, keys=keys) return str(table.sql_schema()) @@ -893,7 +1188,11 @@ def _create_sql_schema(self, frame, table_name): }, 'date': { 'mysql': 'DATE', - 'sqlite': 'TIMESTAMP', + 'sqlite': 'DATE', + }, + 'time': { + 'mysql': 'TIME', + 'sqlite': 'TIME', }, 'bool': { 'mysql': 'BOOLEAN', @@ -921,15 +1220,19 @@ def _create_sql_schema(self, frame, table_name): "underscores.") -class PandasSQLTableLegacy(PandasSQLTable): - """Patch the PandasSQLTable for legacy support. - Instead of a table variable just use the Create Table - statement""" +class SQLiteTable(SQLTable): + """ + Patch the SQLTable for fallback support. + Instead of a table variable just use the Create Table statement. + """ + def sql_schema(self): - return str(self.table) + return str(";\n".join(self.table)) - def create(self): - self.pd_sql.execute(self.table) + def _execute_create(self): + with self.pd_sql.run_transaction() as conn: + for stmt in self.table: + conn.execute(stmt) def insert_statement(self): names = list(map(str, self.frame.columns)) @@ -948,28 +1251,21 @@ def insert_statement(self): self.name, col_names, wildcards) return insert_statement - def insert(self): - ins = self.insert_statement() - temp = self.insert_data() - data_list = [] - - for t in temp.itertuples(): - data = tuple((self.maybe_asscalar(v) for v in t[1:])) - data_list.append(data) - - cur = self.pd_sql.con.cursor() - cur.executemany(ins, data_list) - cur.close() - self.pd_sql.con.commit() - - def _create_table_statement(self): - "Return a CREATE TABLE statement to suit the contents of a DataFrame." + def _execute_insert(self, conn, keys, data_iter): + data_list = list(data_iter) + conn.executemany(self.insert_statement(), data_list) + def _create_table_setup(self): + """ + Return a list of SQL statement that create a table reflecting the + structure of a DataFrame. 
The first entry will be a CREATE TABLE + statement while the rest will be CREATE INDEX statements + """ column_names_and_types = \ self._get_column_names_and_types(self._sql_type_name) pat = re.compile('\s+') - column_names = [col_name for col_name, _ in column_names_and_types] + column_names = [col_name for col_name, _, _ in column_names_and_types] if any(map(pat.search, column_names)): warnings.warn(_SAFE_NAMES_WARNING) @@ -978,18 +1274,30 @@ def _create_table_statement(self): br_l = _SQL_SYMB[flv]['br_l'] # left val quote char br_r = _SQL_SYMB[flv]['br_r'] # right val quote char - col_template = br_l + '%s' + br_r + ' %s' - - columns = ',\n '.join(col_template % - x for x in column_names_and_types) - template = """CREATE TABLE %(name)s ( - %(columns)s - )""" - create_statement = template % {'name': self.name, 'columns': columns} - return create_statement - - def _sql_type_name(self, dtype): - pytype = dtype.type + create_tbl_stmts = [(br_l + '%s' + br_r + ' %s') % (cname, ctype) + for cname, ctype, _ in column_names_and_types] + if self.keys is not None and len(self.keys): + cnames_br = ",".join([br_l + c + br_r for c in self.keys]) + create_tbl_stmts.append( + "CONSTRAINT {tbl}_pk PRIMARY KEY ({cnames_br})".format( + tbl=self.name, cnames_br=cnames_br)) + + create_stmts = ["CREATE TABLE " + self.name + " (\n" + + ',\n '.join(create_tbl_stmts) + "\n)"] + + ix_cols = [cname for cname, _, is_index in column_names_and_types + if is_index] + if len(ix_cols): + cnames = "_".join(ix_cols) + cnames_br = ",".join([br_l + c + br_r for c in ix_cols]) + create_stmts.append( + "CREATE INDEX ix_{tbl}_{cnames} ON {tbl} ({cnames_br})".format( + tbl=self.name, cnames=cnames, cnames_br=cnames_br)) + + return create_stmts + + def _sql_type_name(self, col): + pytype = col.dtype.type pytype_name = "text" if issubclass(pytype, np.floating): pytype_name = "float" @@ -1003,15 +1311,31 @@ def _sql_type_name(self, dtype): elif issubclass(pytype, np.datetime64) or pytype is datetime: # Caution: np.datetime64 is also a subclass of np.number. pytype_name = "datetime" - elif pytype is datetime.date: - pytype_name = "date" elif issubclass(pytype, np.bool_): pytype_name = "bool" + elif issubclass(pytype, np.object): + pytype = lib.infer_dtype(com._ensure_object(col)) + if pytype == "date": + pytype_name = "date" + elif pytype == "time": + pytype_name = "time" return _SQL_TYPES[pytype_name][self.pd_sql.flavor] -class PandasSQLLegacy(PandasSQL): +class SQLiteDatabase(PandasSQL): + """ + Version of SQLDatabase to support sqlite connections (fallback without + sqlalchemy). This should only be used internally. + + For now still supports `flavor` argument to deal with 'mysql' database + for backwards compatibility, but this will be removed in future versions. 
+ + Parameters + ---------- + con : sqlite connection object + + """ def __init__(self, con, flavor, is_cursor=False): self.is_cursor = is_cursor @@ -1023,6 +1347,18 @@ def __init__(self, con, flavor, is_cursor=False): else: self.flavor = flavor + @contextmanager + def run_transaction(self): + cur = self.con.cursor() + try: + yield cur + self.con.commit() + except: + self.con.rollback() + raise + finally: + cur.close() + def execute(self, *args, **kwargs): if self.is_cursor: cur = self.con @@ -1034,33 +1370,52 @@ def execute(self, *args, **kwargs): else: cur.execute(*args) return cur - except Exception as e: + except Exception as exc: try: self.con.rollback() except Exception: # pragma: no cover - ex = DatabaseError( - "Execution failed on sql: %s\n%s\nunable to rollback" % (args[0], e)) + ex = DatabaseError("Execution failed on sql: %s\n%s\nunable" + " to rollback" % (args[0], exc)) raise_with_traceback(ex) - ex = DatabaseError("Execution failed on sql: %s" % args[0]) + ex = DatabaseError("Execution failed on sql '%s': %s" % (args[0], exc)) raise_with_traceback(ex) - def read_sql(self, sql, index_col=None, coerce_float=True, params=None, - parse_dates=None): + @staticmethod + def _query_iterator(cursor, chunksize, columns, index_col=None, + coerce_float=True, parse_dates=None): + """Return generator through chunked result set""" + + while True: + data = cursor.fetchmany(chunksize) + if not data: + cursor.close() + break + else: + yield _wrap_result(data, columns, index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates) + + def read_query(self, sql, index_col=None, coerce_float=True, params=None, + parse_dates=None, chunksize=None): + args = _convert_params(sql, params) cursor = self.execute(*args) columns = [col_desc[0] for col_desc in cursor.description] - data = self._fetchall_as_list(cursor) - cursor.close() - - data_frame = DataFrame.from_records( - data, columns=columns, coerce_float=coerce_float) - _parse_date_columns(data_frame, parse_dates) + if chunksize is not None: + return self._query_iterator(cursor, chunksize, columns, + index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates) + else: + data = self._fetchall_as_list(cursor) + cursor.close() - if index_col is not None: - data_frame.set_index(index_col, inplace=True) - return data_frame + frame = _wrap_result(data, columns, index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates) + return frame def _fetchall_as_list(self, cur): result = cur.fetchall() @@ -1069,7 +1424,7 @@ def _fetchall_as_list(self, cur): return result def to_sql(self, frame, name, if_exists='fail', index=True, - index_label=None): + index_label=None, schema=None, chunksize=None): """ Write records stored in a DataFrame to a SQL database. @@ -1077,19 +1432,30 @@ def to_sql(self, frame, name, if_exists='fail', index=True, ---------- frame: DataFrame name: name of SQL table - flavor: {'sqlite', 'mysql'}, default 'sqlite' if_exists: {'fail', 'replace', 'append'}, default 'fail' fail: If table exists, do nothing. replace: If table exists, drop it, recreate it, and insert data. append: If table exists, insert data. Create if does not exist. + index : boolean, default True + Write DataFrame index as a column + index_label : string or sequence, default None + Column label for index column(s). If None is given (default) and + `index` is True, then the index names are used. + A sequence should be given if the DataFrame uses MultiIndex. 
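For clarity, a standalone sketch of the commit/rollback behaviour that the ``run_transaction`` context manager above gives the fallback mode; this mirrors the code in the patch rather than adding new behaviour.

import sqlite3
from contextlib import contextmanager

@contextmanager
def run_transaction(con):
    cur = con.cursor()
    try:
        yield cur
        con.commit()              # commit only if the block finished without raising
    except:
        con.rollback()            # otherwise undo the partial work and re-raise
        raise
    finally:
        cur.close()

con = sqlite3.connect(':memory:')
con.execute("CREATE TABLE t (x INTEGER)")
with run_transaction(con) as cur:
    cur.execute("INSERT INTO t VALUES (1)")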
+ schema : string, default None + Ignored parameter included for compatability with SQLAlchemy + version of ``to_sql``. + chunksize : int, default None + If not None, then rows will be written in batches of this + size at a time. If None, all rows will be written at once. """ - table = PandasSQLTableLegacy( - name, self, frame=frame, index=index, if_exists=if_exists, - index_label=index_label) - table.insert() + table = SQLiteTable(name, self, frame=frame, index=index, + if_exists=if_exists, index_label=index_label) + table.create() + table.insert(chunksize) - def has_table(self, name): + def has_table(self, name, schema=None): flavor_map = { 'sqlite': ("SELECT name FROM sqlite_master " "WHERE type='table' AND name='%s';") % name, @@ -1098,15 +1464,16 @@ def has_table(self, name): return len(self.execute(query).fetchall()) > 0 - def get_table(self, table_name): - return None # not supported in Legacy mode + def get_table(self, table_name, schema=None): + return None # not supported in fallback mode - def drop_table(self, name): + def drop_table(self, name, schema=None): drop_sql = "DROP TABLE %s" % name self.execute(drop_sql) - def _create_sql_schema(self, frame, table_name): - table = PandasSQLTableLegacy(table_name, self, frame=frame) + def _create_sql_schema(self, frame, table_name, keys=None): + table = SQLiteTable(table_name, self, frame=frame, index=False, + keys=keys) return str(table.sql_schema()) @@ -1132,58 +1499,8 @@ def get_schema(frame, name, flavor='sqlite', keys=None, con=None): """ - if con is None: - if flavor == 'mysql': - warnings.warn(_MYSQL_WARNING, FutureWarning) - return _get_schema_legacy(frame, name, flavor, keys) - pandas_sql = pandasSQL_builder(con=con, flavor=flavor) - return pandas_sql._create_sql_schema(frame, name) - - -def _get_schema_legacy(frame, name, flavor, keys=None): - """Old function from 0.13.1. To keep backwards compatibility. - When mysql legacy support is dropped, it should be possible to - remove this code - """ - - def get_sqltype(dtype, flavor): - pytype = dtype.type - pytype_name = "text" - if issubclass(pytype, np.floating): - pytype_name = "float" - elif issubclass(pytype, np.integer): - pytype_name = "int" - elif issubclass(pytype, np.datetime64) or pytype is datetime: - # Caution: np.datetime64 is also a subclass of np.number. 
- pytype_name = "datetime" - elif pytype is datetime.date: - pytype_name = "date" - elif issubclass(pytype, np.bool_): - pytype_name = "bool" - - return _SQL_TYPES[pytype_name][flavor] - - lookup_type = lambda dtype: get_sqltype(dtype, flavor) - - column_types = lzip(frame.dtypes.index, map(lookup_type, frame.dtypes)) - if flavor == 'sqlite': - columns = ',\n '.join('[%s] %s' % x for x in column_types) - else: - columns = ',\n '.join('`%s` %s' % x for x in column_types) - - keystr = '' - if keys is not None: - if isinstance(keys, string_types): - keys = (keys,) - keystr = ', PRIMARY KEY (%s)' % ','.join(keys) - template = """CREATE TABLE %(name)s ( - %(columns)s - %(keystr)s - );""" - create_statement = template % {'name': name, 'columns': columns, - 'keystr': keystr} - return create_statement + return pandas_sql._create_sql_schema(frame, name, keys=keys) # legacy names, with depreciation warnings and copied docs diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 5b5ce3e59e16e..c2542594861c4 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -9,25 +9,27 @@ You can find more information on http://presbrey.mit.edu/PyDTA and http://statsmodels.sourceforge.net/devel/ """ -# TODO: Fix this module so it can use cross-compatible zip, map, and range import numpy as np import sys import struct +from dateutil.relativedelta import relativedelta from pandas.core.base import StringMixin from pandas.core.frame import DataFrame from pandas.core.series import Series from pandas.core.categorical import Categorical import datetime -from pandas import compat -from pandas.compat import long, lrange, lmap, lzip, text_type, string_types -from pandas import isnull +from pandas import compat, to_timedelta, to_datetime, isnull, DatetimeIndex +from pandas.compat import lrange, lmap, lzip, text_type, string_types, range, \ + zip +import pandas.core.common as com from pandas.io.common import get_filepath_or_buffer -from pandas.lib import max_len_string_array, is_string_array -from pandas.tslib import NaT +from pandas.lib import max_len_string_array, infer_dtype +from pandas.tslib import NaT, Timestamp def read_stata(filepath_or_buffer, convert_dates=True, - convert_categoricals=True, encoding=None, index=None): + convert_categoricals=True, encoding=None, index=None, + convert_missing=False, preserve_dtypes=True, columns=None): """ Read Stata file into DataFrame @@ -44,29 +46,54 @@ def read_stata(filepath_or_buffer, convert_dates=True, support unicode. None defaults to cp1252. index : identifier of index column identifier of column that should be used as index of the DataFrame + convert_missing : boolean, defaults to False + Flag indicating whether to convert missing values to their Stata + representations. If False, missing values are replaced with nans. + If True, columns containing missing values are returned with + object data types and missing values are represented by + StataMissingValue objects. + preserve_dtypes : boolean, defaults to True + Preserve Stata datatypes. If False, numeric data are upcast to pandas + default types for foreign data (float64 or int64) + columns : list or None + Columns to retain. Columns will be returned in the given order. 
None + returns all columns """ reader = StataReader(filepath_or_buffer, encoding) - return reader.data(convert_dates, convert_categoricals, index) + return reader.data(convert_dates, convert_categoricals, index, + convert_missing, preserve_dtypes, columns) _date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"] -def _stata_elapsed_date_to_datetime(date, fmt): +stata_epoch = datetime.datetime(1960, 1, 1) + + +def _stata_elapsed_date_to_datetime_vec(dates, fmt): """ Convert from SIF to datetime. http://www.stata.com/help.cgi?datetime Parameters ---------- - date : int + dates : Series The Stata Internal Format date to convert to datetime according to fmt fmt : str The format to convert to. Can be, tc, td, tw, tm, tq, th, ty + Returns + + Returns + ------- + converted : Series + The converted dates Examples -------- - >>> _stata_elapsed_date_to_datetime(52, "%tw") - datetime.datetime(1961, 1, 1, 0, 0) + >>> import pandas as pd + >>> dates = pd.Series([52]) + >>> _stata_elapsed_date_to_datetime_vec(dates , "%tw") + 0 1961-01-01 + dtype: datetime64[ns] Notes ----- @@ -93,94 +120,207 @@ def _stata_elapsed_date_to_datetime(date, fmt): If you don't have pandas with datetime support, then you can't do milliseconds accurately. """ - #NOTE: we could run into overflow / loss of precision situations here - # casting to int, but I'm not sure what to do. datetime won't deal with - # numpy types and numpy datetime isn't mature enough / we can't rely on - # pandas version > 0.7.1 - #TODO: IIRC relative delta doesn't play well with np.datetime? - #TODO: When pandas supports more than datetime64[ns], this should be improved to use correct range, e.g. datetime[Y] for yearly - if np.isnan(date): - return NaT - - date = int(date) - stata_epoch = datetime.datetime(1960, 1, 1) - if fmt in ["%tc", "tc"]: - from dateutil.relativedelta import relativedelta - return stata_epoch + relativedelta(microseconds=date * 1000) + MIN_YEAR, MAX_YEAR = Timestamp.min.year, Timestamp.max.year + MAX_DAY_DELTA = (Timestamp.max - datetime.datetime(1960, 1, 1)).days + MIN_DAY_DELTA = (Timestamp.min - datetime.datetime(1960, 1, 1)).days + MIN_MS_DELTA = MIN_DAY_DELTA * 24 * 3600 * 1000 + MAX_MS_DELTA = MAX_DAY_DELTA * 24 * 3600 * 1000 + + def convert_year_month_safe(year, month): + """ + Convert year and month to datetimes, using pandas vectorized versions + when the date range falls within the range supported by pandas. Other + wise it falls back to a slower but more robust method using datetime. + """ + if year.max() < MAX_YEAR and year.min() > MIN_YEAR: + return to_datetime(100 * year + month, format='%Y%m') + else: + return Series( + [datetime.datetime(y, m, 1) for y, m in zip(year, month)]) + + def convert_year_days_safe(year, days): + """ + Converts year (e.g. 1999) and days since the start of the year to a + datetime or datetime64 Series + """ + if year.max() < (MAX_YEAR - 1) and year.min() > MIN_YEAR: + return to_datetime(year, format='%Y') + to_timedelta(days, unit='d') + else: + value = [datetime.datetime(y, 1, 1) + relativedelta(days=int(d)) for + y, d in zip(year, days)] + return Series(value) + + def convert_delta_safe(base, deltas, unit): + """ + Convert base dates and deltas to datetimes, using pandas vectorized + versions if the deltas satisfy restrictions required to be expressed + as dates in pandas. 
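An illustrative call (the file name and column names are assumptions) exercising the new ``read_stata`` keywords documented above:

import pandas as pd

df = pd.read_stata('survey.dta',                 # hypothetical file
                   columns=['id', 'income'],     # retain only these, in this order
                   convert_missing=True,         # keep StataMissingValue objects
                   preserve_dtypes=False)        # upcast to float64 / int64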
+ """ + if unit == 'd': + if deltas.max() > MAX_DAY_DELTA or deltas.min() < MIN_DAY_DELTA: + values = [base + relativedelta(days=int(d)) for d in deltas] + return Series(values) + elif unit == 'ms': + if deltas.max() > MAX_MS_DELTA or deltas.min() < MIN_MS_DELTA: + values = [base + relativedelta(microseconds=(int(d) * 1000)) for + d in deltas] + return Series(values) + else: + raise ValueError('format not understood') + + base = to_datetime(base) + deltas = to_timedelta(deltas, unit=unit) + return base + deltas + + # TODO: If/when pandas supports more than datetime64[ns], this should be improved to use correct range, e.g. datetime[Y] for yearly + bad_locs = np.isnan(dates) + has_bad_values = False + if bad_locs.any(): + has_bad_values = True + data_col = Series(dates) + data_col[bad_locs] = 1.0 # Replace with NaT + dates = dates.astype(np.int64) + + if fmt in ["%tc", "tc"]: # Delta ms relative to base + base = stata_epoch + ms = dates + conv_dates = convert_delta_safe(base, ms, 'ms') elif fmt in ["%tC", "tC"]: from warnings import warn + warn("Encountered %tC format. Leaving in Stata Internal Format.") - return date - elif fmt in ["%td", "td", "%d", "d"]: - return stata_epoch + datetime.timedelta(int(date)) + conv_dates = Series(dates, dtype=np.object) + if has_bad_values: + conv_dates[bad_locs] = np.nan + return conv_dates + elif fmt in ["%td", "td", "%d", "d"]: # Delta days relative to base + base = stata_epoch + days = dates + conv_dates = convert_delta_safe(base, days, 'd') elif fmt in ["%tw", "tw"]: # does not count leap days - 7 days is a week - year = datetime.datetime(stata_epoch.year + date // 52, 1, 1) - day_delta = (date % 52) * 7 - return year + datetime.timedelta(int(day_delta)) - elif fmt in ["%tm", "tm"]: - year = stata_epoch.year + date // 12 - month_delta = (date % 12) + 1 - return datetime.datetime(year, month_delta, 1) - elif fmt in ["%tq", "tq"]: - year = stata_epoch.year + date // 4 - month_delta = (date % 4) * 3 + 1 - return datetime.datetime(year, month_delta, 1) - elif fmt in ["%th", "th"]: - year = stata_epoch.year + date // 2 - month_delta = (date % 2) * 6 + 1 - return datetime.datetime(year, month_delta, 1) - elif fmt in ["%ty", "ty"]: - if date > 0: - return datetime.datetime(date, 1, 1) - else: # don't do negative years bc can't mix dtypes in column - raise ValueError("Year 0 and before not implemented") + year = stata_epoch.year + dates // 52 + days = (dates % 52) * 7 + conv_dates = convert_year_days_safe(year, days) + elif fmt in ["%tm", "tm"]: # Delta months relative to base + year = stata_epoch.year + dates // 12 + month = (dates % 12) + 1 + conv_dates = convert_year_month_safe(year, month) + elif fmt in ["%tq", "tq"]: # Delta quarters relative to base + year = stata_epoch.year + dates // 4 + month = (dates % 4) * 3 + 1 + conv_dates = convert_year_month_safe(year, month) + elif fmt in ["%th", "th"]: # Delta half-years relative to base + year = stata_epoch.year + dates // 2 + month = (dates % 2) * 6 + 1 + conv_dates = convert_year_month_safe(year, month) + elif fmt in ["%ty", "ty"]: # Years -- not delta + year = dates + month = np.ones_like(dates) + conv_dates = convert_year_month_safe(year, month) else: raise ValueError("Date fmt %s not understood" % fmt) + if has_bad_values: # Restore NaT for bad values + conv_dates[bad_locs] = NaT + return conv_dates + -def _datetime_to_stata_elapsed(date, fmt): +def _datetime_to_stata_elapsed_vec(dates, fmt): """ Convert from datetime to SIF. 
http://www.stata.com/help.cgi?datetime Parameters ---------- - date : datetime.datetime - The date to convert to the Stata Internal Format given by fmt + dates : Series + Series or array containing datetime.datetime or datetime64[ns] to + convert to the Stata Internal Format given by fmt fmt : str The format to convert to. Can be, tc, td, tw, tm, tq, th, ty """ - if not isinstance(date, datetime.datetime): - raise ValueError("date should be datetime.datetime format") - stata_epoch = datetime.datetime(1960, 1, 1) - # Handle NaTs - if date is NaT: - # Missing value for dates ('.'), assumed always double - # TODO: Should be moved so a const somewhere, and consolidated - return struct.unpack(' 6) + d = parse_dates_safe(dates, year=True) + conv_dates = 2 * (d.year - stata_epoch.year) + \ + (d.month > 6).astype(np.int) elif fmt in ["%ty", "ty"]: - return date.year + d = parse_dates_safe(dates, year=True) + conv_dates = d.year else: raise ValueError("fmt %s not understood" % fmt) + conv_dates = Series(conv_dates, dtype=np.float64) + missing_value = struct.unpack('= 2 * 53: + if data[col].max() >= 2 ** 53: ws = precision_loss_doc % ('uint64', 'float64') data[col] = data[col].astype(dtype) @@ -291,35 +431,76 @@ class StataMissingValue(StringMixin): Parameters ----------- - offset - value + value : int8, int16, int32, float32 or float64 + The Stata missing value code Attributes ---------- - string - value + string : string + String representation of the Stata missing value + value : int8, int16, int32, float32 or float64 + The original encoded missing value Notes ----- More information: + + Integer missing values make the code '.', '.a', ..., '.z' to the ranges + 101 ... 127 (for int8), 32741 ... 32767 (for int16) and 2147483621 ... + 2147483647 (for int32). Missing values for floating point data types are + more complex but the pattern is simple to discern from the following table. + + np.float32 missing values (float in Stata) + 0000007f . + 0008007f .a + 0010007f .b + ... + 00c0007f .x + 00c8007f .y + 00d0007f .z + + np.float64 missing values (double in Stata) + 000000000000e07f . + 000000000001e07f .a + 000000000002e07f .b + ... + 000000000018e07f .x + 000000000019e07f .y + 00000000001ae07f .z """ - # TODO: Needs test - def __init__(self, offset, value): + + # Construct a dictionary of missing values + MISSING_VALUES = {} + bases = (101, 32741, 2147483621) + for b in bases: + MISSING_VALUES[b] = '.' + for i in range(1, 27): + MISSING_VALUES[i + b] = '.' 
+ chr(96 + i) + + base = b'\x00\x00\x00\x7f' + increment = struct.unpack(' 0: + MISSING_VALUES[value] += chr(96 + i) + int_value = struct.unpack(' 0: + MISSING_VALUES[value] += chr(96 + i) + int_value = struct.unpack('q', struct.pack(' nmax: - if self._missing_values: - return StataMissingValue(nmax, d) - else: - return None - return d - def _null_terminate(self, s): if compat.PY3 or self._encoding is not None: # have bytes not strings, # so must decode @@ -737,56 +913,6 @@ def _null_terminate(self, s): except: return s - def _next(self): - typlist = self.typlist - if self.has_string_data: - data = [None] * self.nvar - for i in range(len(data)): - if type(typlist[i]) is int: - data[i] = self._null_terminate( - self.path_or_buf.read(typlist[i]) - ) - else: - data[i] = self._unpack( - typlist[i], self.path_or_buf.read(self._col_size(i)) - ) - return data - else: - return list( - map( - lambda i: self._unpack(typlist[i], - self.path_or_buf.read( - self._col_size(i) - )), - range(self.nvar) - ) - ) - - def _dataset(self): - """ - Returns a Python generator object for iterating over the dataset. - - - Parameters - ---------- - - Returns - ------- - Generator object for iterating over the dataset. Yields each row of - observations as a list by default. - - Notes - ----- - If missing_values is True during instantiation of StataReader then - observations with _StataMissingValue(s) are not filtered and should - be handled by your applcation. - """ - - self.path_or_buf.seek(self.data_location) - - for i in range(self.nobs): - yield self._next() - def _read_value_labels(self): if self.format_version >= 117: self.path_or_buf.seek(self.seek_value_labels) @@ -853,7 +979,8 @@ def _read_strls(self): self.GSO[v_o] = self.path_or_buf.read(length-1) self.path_or_buf.read(1) # zero-termination - def data(self, convert_dates=True, convert_categoricals=True, index=None): + def data(self, convert_dates=True, convert_categoricals=True, index=None, + convert_missing=False, preserve_dtypes=True, columns=None): """ Reads observations from Stata file, converting them into a dataframe @@ -866,11 +993,24 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None): variables index : identifier of index column identifier of column that should be used as index of the DataFrame + convert_missing : boolean, defaults to False + Flag indicating whether to convert missing values to their Stata + representation. If False, missing values are replaced with + nans. If True, columns containing missing values are returned with + object data types and missing values are represented by + StataMissingValue objects. + preserve_dtypes : boolean, defaults to True + Preserve Stata datatypes. If False, numeric data are upcast to + pandas default types for foreign data (float64 or int64) + columns : list or None + Columns to retain. Columns will be returned in the given order. + None returns all columns Returns ------- y : DataFrame instance """ + self._missing_values = convert_missing if self._data_read: raise Exception("Data has already been read.") self._data_read = True @@ -878,18 +1018,19 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None): if self.format_version >= 117: self._read_strls() - stata_dta = self._dataset() - - data = [] - for rownum, line in enumerate(stata_dta): - # doesn't handle missing value objects, just casts - # None will only work without missing value object. 
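As a quick check on the integer missing-value pattern described in the ``StataMissingValue`` notes above, the codes can be reproduced with the same arithmetic the class uses:

codes = {}
for base in (101, 32741, 2147483621):            # int8, int16, int32 bases
    codes[base] = '.'
    for i in range(1, 27):
        codes[base + i] = '.' + chr(96 + i)      # chr(97) == 'a'
assert codes[102] == '.a' and codes[127] == '.z'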
- for i, val in enumerate(line): - #NOTE: This will only be scalar types because missing strings - # are empty not None in Stata - if val is None: - line[i] = np.nan - data.append(tuple(line)) + # Read data + count = self.nobs + dtype = [] # Convert struct data types to numpy data type + for i, typ in enumerate(self.typlist): + if typ in self.NUMPY_TYPE_MAP: + dtype.append(('s' + str(i), self.NUMPY_TYPE_MAP[typ])) + else: + dtype.append(('s' + str(i), 'S' + str(typ))) + dtype = np.dtype(dtype) + read_len = count * dtype.itemsize + self.path_or_buf.seek(self.data_location) + data = np.frombuffer(self.path_or_buf.read(read_len),dtype=dtype,count=count) + self._data_read = True if convert_categoricals: self._read_value_labels() @@ -897,23 +1038,99 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None): if len(data)==0: data = DataFrame(columns=self.varlist, index=index) else: - data = DataFrame(data, columns=self.varlist, index=index) + data = DataFrame.from_records(data, index=index) + data.columns = self.varlist + + if columns is not None: + column_set = set(columns) + if len(column_set) != len(columns): + raise ValueError('columns contains duplicate entries') + unmatched = column_set.difference(data.columns) + if unmatched: + raise ValueError('The following columns were not found in the ' + 'Stata data set: ' + + ', '.join(list(unmatched))) + # Copy information for retained columns for later processing + dtyplist = [] + typlist = [] + fmtlist = [] + lbllist = [] + matched = set() + for i, col in enumerate(data.columns): + if col in column_set: + matched.update([col]) + dtyplist.append(self.dtyplist[i]) + typlist.append(self.typlist[i]) + fmtlist.append(self.fmtlist[i]) + lbllist.append(self.lbllist[i]) + + data = data[columns] + self.dtyplist = dtyplist + self.typlist = typlist + self.fmtlist = fmtlist + self.lbllist = lbllist + + for col, typ in zip(data, self.typlist): + if type(typ) is int: + data[col] = data[col].apply(self._null_terminate, convert_dtype=True,) cols_ = np.where(self.dtyplist)[0] + + # Convert columns (if needed) to match input type + index = data.index + requires_type_conversion = False + data_formatted = [] for i in cols_: if self.dtyplist[i] is not None: col = data.columns[i] - if data[col].dtype is not np.dtype(object): - data[col] = Series(data[col], data[col].index, - self.dtyplist[i]) + dtype = data[col].dtype + if (dtype != np.dtype(object)) and (dtype != self.dtyplist[i]): + requires_type_conversion = True + data_formatted.append((col, Series(data[col], index, self.dtyplist[i]))) + else: + data_formatted.append((col, data[col])) + if requires_type_conversion: + data = DataFrame.from_items(data_formatted) + del data_formatted + + # Check for missing values, and replace if found + for i, colname in enumerate(data): + fmt = self.typlist[i] + if fmt not in self.VALID_RANGE: + continue + + nmin, nmax = self.VALID_RANGE[fmt] + series = data[colname] + missing = np.logical_or(series < nmin, series > nmax) + + if not missing.any(): + continue + + if self._missing_values: # Replacement follows Stata notation + missing_loc = np.argwhere(missing) + umissing, umissing_loc = np.unique(series[missing], + return_inverse=True) + replacement = Series(series, dtype=np.object) + for i, um in enumerate(umissing): + missing_value = StataMissingValue(um) + + loc = missing_loc[umissing_loc == i] + replacement.iloc[loc] = missing_value + else: # All replacements are identical + dtype = series.dtype + if dtype not in (np.float32, np.float64): + dtype = np.float64 
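A minimal, self-contained sketch of the structured-dtype read used in the rewritten ``data`` method above: the record layout is expressed as a numpy dtype and the whole data block is decoded in one ``np.frombuffer`` call (the field names and types here are assumptions, not the Stata type map).

import io
import numpy as np

dtype = np.dtype([('s0', '<i4'), ('s1', '<f8'), ('s2', 'S4')])   # int32, double, 4-byte string
raw = np.array([(1, 2.5, b'ab'), (2, 3.5, b'cd')], dtype=dtype).tobytes()
buf = io.BytesIO(raw)

count = 2
data = np.frombuffer(buf.read(count * dtype.itemsize), dtype=dtype, count=count)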
+ replacement = Series(series, dtype=dtype) + replacement[missing] = np.nan + + data[colname] = replacement if convert_dates: cols = np.where(lmap(lambda x: x in _date_formats, self.fmtlist))[0] for i in cols: col = data.columns[i] - data[col] = data[col].apply(_stata_elapsed_date_to_datetime, - args=(self.fmtlist[i],)) + data[col] = _stata_elapsed_date_to_datetime_vec(data[col], self.fmtlist[i]) if convert_categoricals: cols = np.where( @@ -929,6 +1146,21 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None): labeled_data[(data[col] == k).values] = v data[col] = Categorical.from_array(labeled_data) + if not preserve_dtypes: + retyped_data = [] + convert = False + for col in data: + dtype = data[col].dtype + if dtype in (np.float16, np.float32): + dtype = np.float64 + convert = True + elif dtype in (np.int8, np.int16, np.int32): + dtype = np.int64 + convert = True + retyped_data.append((col, data[col].astype(dtype))) + if convert: + data = DataFrame.from_items(retyped_data) + return data def data_label(self): @@ -1051,9 +1283,8 @@ def _dtype_to_default_stata_fmt(dtype, column): Maps numpy dtype to stata's default format for this type. Not terribly important since users can change this in Stata. Semantics are - string -> "%DDs" where DD is the length of the string - object -> "%DDs" where DD is the length of the string, if a string, or 244 - for anything that cannot be converted to a string. + object -> "%DDs" where DD is the length of the string. If not a string, + raise ValueError float64 -> "%10.0g" float32 -> "%9.0g" int64 -> "%9.0g" @@ -1061,19 +1292,13 @@ def _dtype_to_default_stata_fmt(dtype, column): int16 -> "%8.0g" int8 -> "%8.0g" """ - #TODO: expand this to handle a default datetime format? - if dtype.type == np.string_: - if max_len_string_array(column.values) > 244: - raise ValueError(excessive_string_length_error % column.name) - - return "%" + str(dtype.itemsize) + "s" - elif dtype.type == np.object_: - try: - # Try to use optimal size if available - itemsize = max_len_string_array(column.values) - except: - # Default size - itemsize = 244 + # TODO: expand this to handle a default datetime format? 
+ if dtype.type == np.object_: + inferred_dtype = infer_dtype(column.dropna()) + if not (inferred_dtype in ('string', 'unicode') + or len(column) == 0): + raise ValueError('Writing general object arrays is not supported') + itemsize = max_len_string_array(column.values) if itemsize > 244: raise ValueError(excessive_string_length_error % column.name) @@ -1125,12 +1350,15 @@ class StataWriter(StataParser): Examples -------- + >>> import pandas as pd + >>> data = pd.DataFrame([[1.0, 1]], columns=['a', 'b']) >>> writer = StataWriter('./data_file.dta', data) >>> writer.write_file() Or with dates - - >>> writer = StataWriter('./date_data_file.dta', date, {2 : 'tw'}) + >>> from datetime import datetime + >>> data = pd.DataFrame([[datetime(2000,1,1)]], columns=['date']) + >>> writer = StataWriter('./date_data_file.dta', data, {'date' : 'tw'}) >>> writer.write_file() """ def __init__(self, fname, data, convert_dates=None, write_index=True, @@ -1299,11 +1527,8 @@ def write_file(self): self._write_variable_labels() # write 5 zeros for expansion fields self._write(_pad_bytes("", 5)) - if self._convert_dates is None: - self._write_data_nodates() - else: - self._write_data_dates() - #self._write_value_labels() + self._prepare_data() + self._write_data() self._file.close() def _write_header(self, data_label=None, time_stamp=None): @@ -1370,59 +1595,46 @@ def _write_variable_labels(self, labels=None): for i in range(nvar): self._write(_pad_bytes("", 81)) - def _write_data_nodates(self): - data = self.datarows - byteorder = self._byteorder - TYPE_MAP = self.TYPE_MAP + def _prepare_data(self): + data = self.data.copy() typlist = self.typlist - for row in data: - #row = row.squeeze().tolist() # needed for structured arrays - for i, var in enumerate(row): - typ = ord(typlist[i]) - if typ <= 244: # we've got a string - if var is None or var == np.nan: - var = _pad_bytes('', typ) - if len(var) < typ: - var = _pad_bytes(var, typ) - if compat.PY3: - self._write(var) - else: - self._write(var.encode(self._encoding)) - else: - try: - self._file.write(struct.pack(byteorder + TYPE_MAP[typ], - var)) - except struct.error: - # have to be strict about type pack won't do any - # kind of casting - self._file.write(struct.pack(byteorder+TYPE_MAP[typ], - self.type_converters[typ](var))) - - def _write_data_dates(self): convert_dates = self._convert_dates - data = self.datarows - byteorder = self._byteorder - TYPE_MAP = self.TYPE_MAP - MISSING_VALUES = self.MISSING_VALUES - typlist = self.typlist - for row in data: - #row = row.squeeze().tolist() # needed for structured arrays - for i, var in enumerate(row): - typ = ord(typlist[i]) - #NOTE: If anyone finds this terribly slow, there is - # a vectorized way to convert dates, see genfromdta for going - # from int to datetime and reverse it. will copy data though + # 1. Convert dates + if self._convert_dates is not None: + for i, col in enumerate(data): if i in convert_dates: - var = _datetime_to_stata_elapsed(var, self.fmtlist[i]) - if typ <= 244: # we've got a string - if len(var) < typ: - var = _pad_bytes(var, typ) - if compat.PY3: - self._write(var) - else: - self._write(var.encode(self._encoding)) - else: - self._file.write(struct.pack(byteorder+TYPE_MAP[typ], var)) + data[col] = _datetime_to_stata_elapsed_vec(data[col], + self.fmtlist[i]) + + # 2. 
Convert bad string data to '' and pad to correct length + dtype = [] + data_cols = [] + has_strings = False + for i, col in enumerate(data): + typ = ord(typlist[i]) + if typ <= 244: + has_strings = True + data[col] = data[col].fillna('').apply(_pad_bytes, args=(typ,)) + stype = 'S%d' % typ + dtype.append(('c'+str(i), stype)) + string = data[col].str.encode(self._encoding) + data_cols.append(string.values.astype(stype)) + else: + dtype.append(('c'+str(i), data[col].dtype)) + data_cols.append(data[col].values) + dtype = np.dtype(dtype) + + # 3. Convert to record array + + # data.to_records(index=False, convert_datetime64=False) + if has_strings: + self.data = np.fromiter(zip(*data_cols), dtype=dtype) + else: + self.data = data.to_records(index=False) + + def _write_data(self): + data = self.data + data.tofile(self._file) def _null_terminate(self, s, as_string=False): null_byte = '\x00' diff --git a/pandas/io/tests/data/legacy_pickle/0.15.0/0.15.0_x86_64_linux_2.7.8.pickle b/pandas/io/tests/data/legacy_pickle/0.15.0/0.15.0_x86_64_linux_2.7.8.pickle new file mode 100644 index 0000000000000..d7d20b06df305 Binary files /dev/null and b/pandas/io/tests/data/legacy_pickle/0.15.0/0.15.0_x86_64_linux_2.7.8.pickle differ diff --git a/pandas/io/tests/data/stata8_113.dta b/pandas/io/tests/data/stata8_113.dta new file mode 100644 index 0000000000000..9b0831746025e Binary files /dev/null and b/pandas/io/tests/data/stata8_113.dta differ diff --git a/pandas/io/tests/data/stata8_115.dta b/pandas/io/tests/data/stata8_115.dta new file mode 100644 index 0000000000000..bb78368b3462b Binary files /dev/null and b/pandas/io/tests/data/stata8_115.dta differ diff --git a/pandas/io/tests/data/stata8_117.dta b/pandas/io/tests/data/stata8_117.dta new file mode 100644 index 0000000000000..fcfa7abd7b0d9 Binary files /dev/null and b/pandas/io/tests/data/stata8_117.dta differ diff --git a/pandas/io/tests/data/stata9_115.dta b/pandas/io/tests/data/stata9_115.dta new file mode 100644 index 0000000000000..5ad6cd6a2c8ff Binary files /dev/null and b/pandas/io/tests/data/stata9_115.dta differ diff --git a/pandas/io/tests/data/stata9_117.dta b/pandas/io/tests/data/stata9_117.dta new file mode 100644 index 0000000000000..5ad6cd6a2c8ff Binary files /dev/null and b/pandas/io/tests/data/stata9_117.dta differ diff --git a/pandas/io/tests/generate_legacy_pickles.py b/pandas/io/tests/generate_legacy_pickles.py index b20a1e5b60b86..56ef1aa9b0f19 100644 --- a/pandas/io/tests/generate_legacy_pickles.py +++ b/pandas/io/tests/generate_legacy_pickles.py @@ -60,7 +60,7 @@ def create_data(): from pandas import (Series,TimeSeries,DataFrame,Panel, SparseSeries,SparseTimeSeries,SparseDataFrame,SparsePanel, Index,MultiIndex,PeriodIndex, - date_range,period_range,bdate_range,Timestamp) + date_range,period_range,bdate_range,Timestamp,Categorical) nan = np.nan data = { @@ -85,7 +85,8 @@ def create_data(): mi = Series(np.arange(5).astype(np.float64),index=MultiIndex.from_tuples(tuple(zip(*[[1,1,2,2,2], [3,4,3,4,5]])), names=['one','two'])), - dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A'])) + dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A']), + cat=Series(Categorical(['foo', 'bar', 'baz']))) frame = dict(float = DataFrame(dict(A = series['float'], B = series['float'] + 1)), int = DataFrame(dict(A = series['int'] , B = series['int'] + 1)), @@ -95,7 +96,11 @@ def create_data(): ['one','two','one','two','three']])), names=['first','second'])), dup=DataFrame(np.arange(15).reshape(5, 
3).astype(np.float64), - columns=['A', 'B', 'A'])) + columns=['A', 'B', 'A']), + cat_onecol=DataFrame(dict(A=Categorical(['foo', 'bar']))), + cat_and_float=DataFrame(dict(A=Categorical(['foo', 'bar', 'baz']), + B=np.arange(3))), + ) panel = dict(float = Panel(dict(ItemA = frame['float'], ItemB = frame['float']+1)), dup = Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64), items=['A', 'B', 'A'])) diff --git a/pandas/io/tests/test_clipboard.py b/pandas/io/tests/test_clipboard.py index 33e53fe0b8931..4855b715ebbe2 100644 --- a/pandas/io/tests/test_clipboard.py +++ b/pandas/io/tests/test_clipboard.py @@ -35,6 +35,12 @@ def setUpClass(cls): cls.data['mixed'] = DataFrame({'a': np.arange(1.0, 6.0) + 0.01, 'b': np.arange(1, 6), 'c': list('abcde')}) + + # Test columns exceeding "max_colwidth" (GH8305) + _cw = get_option('display.max_colwidth') + 1 + cls.data['colwidth'] = mkdf(5, 3, data_gen_f=lambda *args: 'x' * _cw, + c_idx_type='s', r_idx_type='i', + c_idx_names=[None], r_idx_names=[None]) # Test GH-5346 max_rows = get_option('display.max_rows') cls.data['longdf'] = mkdf(max_rows+1, 3, data_gen_f=lambda *args: randint(2), diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index 96db535347921..6d3f0b5475298 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -17,8 +17,8 @@ from pandas import DataFrame, Index, MultiIndex from pandas.io.parsers import read_csv from pandas.io.excel import ( - ExcelFile, ExcelWriter, read_excel, _XlwtWriter, _OpenpyxlWriter, - register_writer, _XlsxWriter + ExcelFile, ExcelWriter, read_excel, _XlwtWriter, _Openpyxl1Writer, + _Openpyxl2Writer, register_writer, _XlsxWriter ) from pandas.io.common import URLError from pandas.util.testing import ensure_clean @@ -1127,35 +1127,39 @@ def test_swapped_columns(self): tm.assert_series_equal(write_frame['B'], read_frame['B']) -def raise_wrapper(orig_method): - @functools.wraps(orig_method) - def wrapped(self, *args, **kwargs): - _skip_if_no_openpyxl() - if openpyxl_compat.is_compat(): - orig_method(self, *args, **kwargs) - else: - msg = 'Installed openpyxl is not supported at this time\. Use.+' - with tm.assertRaisesRegexp(ValueError, msg): +def raise_wrapper(major_ver): + def versioned_raise_wrapper(orig_method): + @functools.wraps(orig_method) + def wrapped(self, *args, **kwargs): + _skip_if_no_openpyxl() + if openpyxl_compat.is_compat(major_ver=major_ver): orig_method(self, *args, **kwargs) - return wrapped + else: + msg = 'Installed openpyxl is not supported at this time\. 
Use.+' + with tm.assertRaisesRegexp(ValueError, msg): + orig_method(self, *args, **kwargs) + return wrapped + return versioned_raise_wrapper -def raise_on_incompat_version(cls): - methods = filter(operator.methodcaller('startswith', 'test_'), dir(cls)) - for method in methods: - setattr(cls, method, raise_wrapper(getattr(cls, method))) - return cls +def raise_on_incompat_version(major_ver): + def versioned_raise_on_incompat_version(cls): + methods = filter(operator.methodcaller('startswith', 'test_'), dir(cls)) + for method in methods: + setattr(cls, method, raise_wrapper(major_ver)(getattr(cls, method))) + return cls + return versioned_raise_on_incompat_version -@raise_on_incompat_version +@raise_on_incompat_version(1) class OpenpyxlTests(ExcelWriterBase, tm.TestCase): ext = '.xlsx' - engine_name = 'openpyxl' + engine_name = 'openpyxl1' check_skip = staticmethod(lambda *args, **kwargs: None) def test_to_excel_styleconverter(self): _skip_if_no_openpyxl() - if not openpyxl_compat.is_compat(): + if not openpyxl_compat.is_compat(major_ver=1): raise nose.SkipTest('incompatiable openpyxl version') import openpyxl @@ -1167,7 +1171,7 @@ def test_to_excel_styleconverter(self): "left": "thin"}, "alignment": {"horizontal": "center", "vertical": "top"}} - xlsx_style = _OpenpyxlWriter._convert_to_style(hstyle) + xlsx_style = _Openpyxl1Writer._convert_to_style(hstyle) self.assertTrue(xlsx_style.font.bold) self.assertEqual(openpyxl.style.Border.BORDER_THIN, xlsx_style.borders.top.border_style) @@ -1183,6 +1187,115 @@ def test_to_excel_styleconverter(self): xlsx_style.alignment.vertical) +@raise_on_incompat_version(2) +class Openpyxl2Tests(ExcelWriterBase, tm.TestCase): + ext = '.xlsx' + engine_name = 'openpyxl2' + check_skip = staticmethod(lambda *args, **kwargs: None) + + def test_to_excel_styleconverter(self): + _skip_if_no_openpyxl() + if not openpyxl_compat.is_compat(major_ver=2): + raise nose.SkipTest('incompatiable openpyxl version') + + import openpyxl + from openpyxl import styles + + hstyle = { + "font": { + "color": '00FF0000', + "bold": True, + }, + "borders": { + "top": "thin", + "right": "thin", + "bottom": "thin", + "left": "thin", + }, + "alignment": { + "horizontal": "center", + "vertical": "top", + }, + "fill": { + "patternType": 'solid', + 'fgColor': { + 'rgb': '006666FF', + 'tint': 0.3, + }, + }, + "number_format": { + "format_code": "0.00" + }, + "protection": { + "locked": True, + "hidden": False, + }, + } + + font_color = styles.Color('00FF0000') + font = styles.Font(bold=True, color=font_color) + side = styles.Side(style=styles.borders.BORDER_THIN) + border = styles.Border(top=side, right=side, bottom=side, left=side) + alignment = styles.Alignment(horizontal='center', vertical='top') + fill_color = styles.Color(rgb='006666FF', tint=0.3) + fill = styles.PatternFill(patternType='solid', fgColor=fill_color) + + # ahh openpyxl API changes + ver = openpyxl.__version__ + if ver >= LooseVersion('2.0.0') and ver < LooseVersion('2.1.0'): + number_format = styles.NumberFormat(format_code='0.00') + else: + number_format = '0.00' # XXX: Only works with openpyxl-2.1.0 + + protection = styles.Protection(locked=True, hidden=False) + + kw = _Openpyxl2Writer._convert_to_style_kwargs(hstyle) + self.assertEqual(kw['font'], font) + self.assertEqual(kw['border'], border) + self.assertEqual(kw['alignment'], alignment) + self.assertEqual(kw['fill'], fill) + self.assertEqual(kw['number_format'], number_format) + self.assertEqual(kw['protection'], protection) + + + def 
test_write_cells_merge_styled(self): + _skip_if_no_openpyxl() + if not openpyxl_compat.is_compat(major_ver=2): + raise nose.SkipTest('incompatiable openpyxl version') + + from pandas.core.format import ExcelCell + from openpyxl import styles + + sheet_name='merge_styled' + + sty_b1 = {'font': {'color': '00FF0000'}} + sty_a2 = {'font': {'color': '0000FF00'}} + + initial_cells = [ + ExcelCell(col=1, row=0, val=42, style=sty_b1), + ExcelCell(col=0, row=1, val=99, style=sty_a2), + ] + + sty_merged = {'font': { 'color': '000000FF', 'bold': True }} + sty_kwargs = _Openpyxl2Writer._convert_to_style_kwargs(sty_merged) + openpyxl_sty_merged = styles.Style(**sty_kwargs) + merge_cells = [ + ExcelCell(col=0, row=0, val='pandas', + mergestart=1, mergeend=1, style=sty_merged), + ] + + with ensure_clean('.xlsx') as path: + writer = _Openpyxl2Writer(path) + writer.write_cells(initial_cells, sheet_name=sheet_name) + writer.write_cells(merge_cells, sheet_name=sheet_name) + + wks = writer.sheets[sheet_name] + xcell_b1 = wks.cell('B1') + xcell_a2 = wks.cell('A2') + self.assertEqual(xcell_b1.style, openpyxl_sty_merged) + self.assertEqual(xcell_a2.style, openpyxl_sty_merged) + + class XlwtTests(ExcelWriterBase, tm.TestCase): ext = '.xls' engine_name = 'xlwt' @@ -1216,7 +1329,6 @@ class XlsxWriterTests(ExcelWriterBase, tm.TestCase): check_skip = staticmethod(_skip_if_no_xlsxwriter) -@raise_on_incompat_version class OpenpyxlTests_NoMerge(ExcelWriterBase, tm.TestCase): ext = '.xlsx' engine_name = 'openpyxl' @@ -1254,9 +1366,9 @@ def test_ExcelWriter_dispatch(self): writer_klass = _XlsxWriter except ImportError: _skip_if_no_openpyxl() - if not openpyxl_compat.is_compat(): + if not openpyxl_compat.is_compat(major_ver=1): raise nose.SkipTest('incompatible openpyxl version') - writer_klass = _OpenpyxlWriter + writer_klass = _Openpyxl1Writer with ensure_clean('.xlsx') as path: writer = ExcelWriter(path) diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index 834140e3de746..748a008ae2c4b 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -401,6 +401,32 @@ def test_thousands_macau_index_col(self): self.assertFalse(any(s.isnull().any() for _, s in df.iteritems())) + def test_empty_tables(self): + """ + Make sure that read_html ignores empty tables. + """ + data1 = ''' + + + + + + + + + + + + +
<table>
+            <thead>
+                <tr><th>A</th><th>B</th></tr>
+            </thead>
+            <tbody>
+                <tr><td>1</td><td>2</td></tr>
+            </tbody>
+        </table>'''
+        data2 = data1 + '''<table>
+            <tbody>
+            </tbody>
+        </table>
''' + res1 = self.read_html(StringIO(data1)) + res2 = self.read_html(StringIO(data2)) + assert_framelist_equal(res1, res2) + def test_countries_municipalities(self): # GH5048 data1 = StringIO(''' diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index fd1febc37caac..44daeaca4ed32 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -263,6 +263,15 @@ def test_squeeze(self): tm.assert_isinstance(result, Series) tm.assert_series_equal(result, expected) + def test_squeeze_no_view(self): + + # GH 8217 + # series should not be a view + + data = """time,data\n0,10\n1,11\n2,12\n4,14\n5,15\n3,13""" + result = self.read_csv(StringIO(data), index_col='time', squeeze=True) + self.assertFalse(result._is_view) + def test_inf_parsing(self): data = """\ ,A @@ -723,7 +732,6 @@ def f(i, v): return buf data = StringIO('\n'.join([ f(i, v) for i, v in enumerate(_NA_VALUES) ])) - expected = DataFrame(np.nan,columns=range(nv),index=range(nv)) df = self.read_csv(data, header=None) tm.assert_frame_equal(df, expected) @@ -1279,11 +1287,11 @@ def test_header_multi_index(self): R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2 """ - df = self.read_csv(StringIO(data), header=[0, 2, 3, 4], index_col=[0, 1], tupleize_cols=False) + df = self.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1], tupleize_cols=False) tm.assert_frame_equal(df, expected) # skipping lines in the header - df = self.read_csv(StringIO(data), header=[0, 2, 3, 4], index_col=[0, 1], tupleize_cols=False) + df = self.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1], tupleize_cols=False) tm.assert_frame_equal(df, expected) #### invalid options #### @@ -2514,6 +2522,12 @@ def test_verbose_import(self): finally: sys.stdout = sys.__stdout__ + def test_float_precision_specified(self): + # Should raise an error if float_precision (C parser option) is specified + with tm.assertRaisesRegexp(ValueError, "The 'float_precision' option " + "is not supported with the 'python' engine"): + self.read_csv(StringIO('a,b,c\n1,2,3'), float_precision='high') + def test_iteration_open_handle(self): if PY3: raise nose.SkipTest("won't work in Python 3 {0}".format(sys.version_info)) @@ -2794,6 +2808,58 @@ def test_read_table_buglet_4x_multiindex(self): actual = self.read_table(StringIO(data), sep='\s+') tm.assert_frame_equal(actual, expected) + def test_line_comment(self): + data = """# empty +A,B,C +1,2.,4.#hello world +#ignore this line +5.,NaN,10.0 +""" + expected = [[1., 2., 4.], + [5., np.nan, 10.]] + df = self.read_csv(StringIO(data), comment='#') + tm.assert_almost_equal(df.values, expected) + + def test_empty_lines(self): + data = """\ +A,B,C +1,2.,4. + + +5.,NaN,10.0 + +-70,.4,1 +""" + expected = [[1., 2., 4.], + [5., np.nan, 10.], + [-70., .4, 1.]] + df = self.read_csv(StringIO(data)) + tm.assert_almost_equal(df.values, expected) + df = self.read_csv(StringIO(data.replace(',', ' ')), sep='\s+') + tm.assert_almost_equal(df.values, expected) + expected = [[1., 2., 4.], + [np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan], + [5., np.nan, 10.], + [np.nan, np.nan, np.nan], + [-70., .4, 1.]] + df = self.read_csv(StringIO(data), skip_blank_lines=False) + tm.assert_almost_equal(list(df.values), list(expected)) + + def test_whitespace_lines(self): + data = """ + +\t \t\t + \t +A,B,C + \t 1,2.,4. 
+5.,NaN,10.0 +""" + expected = [[1, 2., 4.], + [5., np.nan, 10.]] + df = self.read_csv(StringIO(data)) + tm.assert_almost_equal(df.values, expected) + class TestFwfColspaceSniffing(tm.TestCase): def test_full_file(self): # File with all values @@ -2944,6 +3010,14 @@ def test_line_comment(self): [5., np.nan, 10.]] df = self.read_csv(StringIO(data), comment='#') tm.assert_almost_equal(df.values, expected) + # check with delim_whitespace=True + df = self.read_csv(StringIO(data.replace(',', ' ')), comment='#', + delim_whitespace=True) + tm.assert_almost_equal(df.values, expected) + # check with custom line terminator + df = self.read_csv(StringIO(data.replace('\n', '*')), comment='#', + lineterminator='*') + tm.assert_almost_equal(df.values, expected) def test_comment_skiprows(self): data = """# empty @@ -2992,6 +3066,46 @@ def test_comment_skiprows_header(self): df = self.read_csv(StringIO(data), comment='#', skiprows=4, header=1) tm.assert_almost_equal(df.values, expected) + def test_empty_lines(self): + data = """\ +A,B,C +1,2.,4. + + +5.,NaN,10.0 + +-70,.4,1 +""" + expected = [[1., 2., 4.], + [5., np.nan, 10.], + [-70., .4, 1.]] + df = self.read_csv(StringIO(data)) + tm.assert_almost_equal(df.values, expected) + df = self.read_csv(StringIO(data.replace(',', ' ')), sep='\s+') + tm.assert_almost_equal(df.values, expected) + expected = [[1., 2., 4.], + [np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan], + [5., np.nan, 10.], + [np.nan, np.nan, np.nan], + [-70., .4, 1.]] + df = self.read_csv(StringIO(data), skip_blank_lines=False) + tm.assert_almost_equal(list(df.values), list(expected)) + + def test_whitespace_lines(self): + data = """ + +\t \t\t + \t +A,B,C + \t 1,2.,4. +5.,NaN,10.0 +""" + expected = [[1, 2., 4.], + [5., np.nan, 10.]] + df = self.read_csv(StringIO(data)) + tm.assert_almost_equal(df.values, expected) + def test_passing_dtype(self): # GH 6607 # This is a copy which should eventually be merged into ParserTests @@ -3071,6 +3185,25 @@ def test_compact_ints(self): ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)]) self.assertEqual(result.dtype, ex_dtype) + def test_precise_conversion(self): + # GH #8002 + from decimal import Decimal + normal_errors = [] + precise_errors = [] + for num in np.linspace(1., 2., num=500): # test numbers between 1 and 2 + text = 'a\n{0:.25}'.format(num) # 25 decimal digits of precision + normal_val = float(self.read_csv(StringIO(text))['a'][0]) + precise_val = float(self.read_csv(StringIO(text), float_precision='high')['a'][0]) + roundtrip_val = float(self.read_csv(StringIO(text), float_precision='round_trip')['a'][0]) + actual_val = Decimal(text[2:]) + def error(val): + return abs(Decimal('{0:.100}'.format(val)) - actual_val) + normal_errors.append(error(normal_val)) + precise_errors.append(error(precise_val)) + self.assertEqual(roundtrip_val, float(text[2:])) # round-trip should match float() + self.assertTrue(sum(precise_errors) <= sum(normal_errors)) + self.assertTrue(max(precise_errors) <= max(normal_errors)) + def test_pass_dtype(self): data = """\ one,two @@ -3438,6 +3571,7 @@ def test_compare_whitespace_regex(self): data = ' a b c\n1 2 3 \n4 5 6\n 7 8 9' result_c = pd.read_table(StringIO(data), sep='\s+', engine='c') result_py = pd.read_table(StringIO(data), sep='\s+', engine='python') + print(result_c) tm.assert_frame_equal(result_c, result_py) def test_fallback_to_python(self): diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index c1419ef2d023e..da9d39ae82617 100644 --- a/pandas/io/tests/test_pytables.py 
+++ b/pandas/io/tests/test_pytables.py @@ -198,8 +198,8 @@ def test_long_strings(self): # GH6166 # unconversion of long strings was being chopped in earlier # versions of numpy < 1.7.2 - df = DataFrame({'a': [tm.rands(100) for _ in range(10)]}, - index=[tm.rands(100) for _ in range(10)]) + df = DataFrame({'a': tm.rands_array(100, size=10)}, + index=tm.rands_array(100, size=10)) with ensure_clean_store(self.path) as store: store.append('df', df, data_columns=['a']) @@ -288,6 +288,10 @@ def test_api(self): self.assertRaises(TypeError, df.to_hdf, path,'df',append=True,format='foo') self.assertRaises(TypeError, df.to_hdf, path,'df',append=False,format='bar') + #File path doesn't exist + path = "" + self.assertRaises(IOError, read_hdf, path, 'df') + def test_api_default_format(self): # default_format option @@ -2320,7 +2324,7 @@ def test_remove_startstop(self): n = store.remove('wp5', start=16, stop=-16) self.assertTrue(n == 120-32) result = store.select('wp5') - expected = wp.reindex(major_axis=wp.major_axis[:16//4]+wp.major_axis[-16//4:]) + expected = wp.reindex(major_axis=wp.major_axis[:16//4].union(wp.major_axis[-16//4:])) assert_panel_equal(result, expected) _maybe_remove(store, 'wp6') @@ -2339,7 +2343,7 @@ def test_remove_startstop(self): n = store.remove('wp7', where=[crit], stop=80) self.assertTrue(n == 28) result = store.select('wp7') - expected = wp.reindex(major_axis=wp.major_axis-wp.major_axis[np.arange(0,20,3)]) + expected = wp.reindex(major_axis=wp.major_axis.difference(wp.major_axis[np.arange(0,20,3)])) assert_panel_equal(result, expected) def test_remove_crit(self): @@ -2357,7 +2361,7 @@ def test_remove_crit(self): self.assertTrue(n == 36) result = store.select('wp3') - expected = wp.reindex(major_axis=wp.major_axis - date4) + expected = wp.reindex(major_axis=wp.major_axis.difference(date4)) assert_panel_equal(result, expected) # upper half @@ -2385,7 +2389,7 @@ def test_remove_crit(self): crit1 = Term('major_axis=date1') store.remove('wp2', where=[crit1]) result = store.select('wp2') - expected = wp.reindex(major_axis=wp.major_axis - date1) + expected = wp.reindex(major_axis=wp.major_axis.difference(date1)) assert_panel_equal(result, expected) date2 = wp.major_axis[5] @@ -2393,7 +2397,7 @@ def test_remove_crit(self): store.remove('wp2', where=[crit2]) result = store['wp2'] expected = wp.reindex( - major_axis=wp.major_axis - date1 - Index([date2])) + major_axis=wp.major_axis.difference(date1).difference(Index([date2]))) assert_panel_equal(result, expected) date3 = [wp.major_axis[7], wp.major_axis[9]] @@ -2401,7 +2405,7 @@ def test_remove_crit(self): store.remove('wp2', where=[crit3]) result = store['wp2'] expected = wp.reindex( - major_axis=wp.major_axis - date1 - Index([date2]) - Index(date3)) + major_axis=wp.major_axis.difference(date1).difference(Index([date2])).difference(Index(date3))) assert_panel_equal(result, expected) # corners @@ -4541,7 +4545,7 @@ def test_categorical(self): with ensure_clean_store(self.path) as store: - s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], levels=['a','b','c','d'])) + s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=['a','b','c','d'])) self.assertRaises(NotImplementedError, store.put, 's_fixed', s, format='fixed') self.assertRaises(NotImplementedError, store.append, 's_table', s, format='table') diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py index 6a0130e515d59..2099a8d0de82e 100644 --- a/pandas/io/tests/test_sql.py +++ b/pandas/io/tests/test_sql.py @@ -6,12 +6,12 @@ - Tests for the public 
API (only tests with sqlite3) - `_TestSQLApi` base class - `TestSQLApi`: test the public API with sqlalchemy engine - - `TesySQLLegacyApi`: test the public API with DBAPI connection + - `TesySQLiteFallbackApi`: test the public API with a sqlite DBAPI connection - Tests for the different SQL flavors (flavor specific type conversions) - Tests for the sqlalchemy mode: `_TestSQLAlchemy` is the base class with common methods, the different tested flavors (sqlite3, MySQL, PostgreSQL) derive from the base class - - Tests for the legacy mode (`TestSQLiteLegacy` and `TestMySQLLegacy`) + - Tests for the fallback mode (`TestSQLiteFallback` and `TestMySQLLegacy`) """ @@ -26,15 +26,16 @@ import warnings import numpy as np -from datetime import datetime +from datetime import datetime, date, time -from pandas import DataFrame, Series, Index, MultiIndex, isnull +from pandas import DataFrame, Series, Index, MultiIndex, isnull, concat from pandas import date_range, to_datetime, to_timedelta import pandas.compat as compat from pandas.compat import StringIO, range, lrange, string_types from pandas.core.datetools import format as date_format import pandas.io.sql as sql +from pandas.io.sql import read_sql_table, read_sql_query import pandas.util.testing as tm @@ -198,7 +199,7 @@ def _load_test2_data(self): E=['1990-11-22', '1991-10-26', '1993-11-26', '1995-12-12'])) df['E'] = to_datetime(df['E']) - self.test_frame3 = df + self.test_frame2 = df def _load_test3_data(self): columns = ['index', 'A', 'B'] @@ -227,19 +228,19 @@ def _count_rows(self, table_name): return result[0] def _read_sql_iris(self): - iris_frame = self.pandasSQL.read_sql("SELECT * FROM iris") + iris_frame = self.pandasSQL.read_query("SELECT * FROM iris") self._check_iris_loaded_frame(iris_frame) def _read_sql_iris_parameter(self): query = SQL_STRINGS['read_parameters'][self.flavor] params = ['Iris-setosa', 5.1] - iris_frame = self.pandasSQL.read_sql(query, params=params) + iris_frame = self.pandasSQL.read_query(query, params=params) self._check_iris_loaded_frame(iris_frame) def _read_sql_iris_named_parameter(self): query = SQL_STRINGS['read_named_parameters'][self.flavor] params = {'name': 'Iris-setosa', 'length': 5.1} - iris_frame = self.pandasSQL.read_sql(query, params=params) + iris_frame = self.pandasSQL.read_query(query, params=params) self._check_iris_loaded_frame(iris_frame) def _to_sql(self): @@ -252,6 +253,10 @@ def _to_sql(self): # Nuke table self.drop_table('test_frame1') + def _to_sql_empty(self): + self.drop_table('test_frame1') + self.pandasSQL.to_sql(self.test_frame1.iloc[:0], 'test_frame1') + def _to_sql_fail(self): self.drop_table('test_frame1') @@ -308,7 +313,7 @@ def _to_sql_append(self): def _roundtrip(self): self.drop_table('test_frame_roundtrip') self.pandasSQL.to_sql(self.test_frame1, 'test_frame_roundtrip') - result = self.pandasSQL.read_sql('SELECT * FROM test_frame_roundtrip') + result = self.pandasSQL.read_query('SELECT * FROM test_frame_roundtrip') result.set_index('level_0', inplace=True) # result.index.astype(int) @@ -323,6 +328,35 @@ def _execute_sql(self): row = iris_results.fetchone() tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, 'Iris-setosa']) + def _to_sql_save_index(self): + df = DataFrame.from_records([(1,2.1,'line1'), (2,1.5,'line2')], + columns=['A','B','C'], index=['A']) + self.pandasSQL.to_sql(df, 'test_to_sql_saves_index') + ix_cols = self._get_index_columns('test_to_sql_saves_index') + self.assertEqual(ix_cols, [['A',],]) + + def _transaction_test(self): + self.pandasSQL.execute("CREATE TABLE test_trans 
(A INT, B TEXT)") + + ins_sql = "INSERT INTO test_trans (A,B) VALUES (1, 'blah')" + + # Make sure when transaction is rolled back, no rows get inserted + try: + with self.pandasSQL.run_transaction() as trans: + trans.execute(ins_sql) + raise Exception('error') + except: + # ignore raised exception + pass + res = self.pandasSQL.read_query('SELECT * FROM test_trans') + self.assertEqual(len(res), 0) + + # Make sure when transaction is committed, rows do get inserted + with self.pandasSQL.run_transaction() as trans: + trans.execute(ins_sql) + res2 = self.pandasSQL.read_query('SELECT * FROM test_trans') + self.assertEqual(len(res2), 1) + #------------------------------------------------------------------------------ #--- Testing the public API @@ -333,7 +367,7 @@ class _TestSQLApi(PandasSQLTest): Base class to test the public API. From this two classes are derived to run these tests for both the - sqlalchemy mode (`TestSQLApi`) and the legacy mode (`TestSQLLegacyApi`). + sqlalchemy mode (`TestSQLApi`) and the fallback mode (`TestSQLiteFallbackApi`). These tests are run with sqlite3. Specific tests for the different sql flavours are included in `_TestSQLAlchemy`. @@ -345,6 +379,7 @@ class _TestSQLApi(PandasSQLTest): """ flavor = 'sqlite' + mode = None def setUp(self): self.conn = self.connect() @@ -455,6 +490,14 @@ def test_roundtrip(self): result.index.name = None tm.assert_frame_equal(result, self.test_frame1) + def test_roundtrip_chunksize(self): + sql.to_sql(self.test_frame1, 'test_frame_roundtrip', con=self.conn, + index=False, flavor='sqlite', chunksize=2) + result = sql.read_sql_query( + 'SELECT * FROM test_frame_roundtrip', + con=self.conn) + tm.assert_frame_equal(result, self.test_frame1) + def test_execute_sql(self): # drop_sql = "DROP TABLE IF EXISTS test" # should already be done iris_results = sql.execute("SELECT * FROM iris", con=self.conn) @@ -507,6 +550,7 @@ def test_date_and_index(self): "IntDateCol loaded with incorrect type") def test_timedelta(self): + # see #6921 df = to_timedelta(Series(['00:00:01', '00:00:03'], name='foo')).to_frame() with tm.assert_produces_warning(UserWarning): @@ -582,13 +626,13 @@ def test_to_sql_index_label_multiindex(self): index_label='C') def test_multiindex_roundtrip(self): - df = DataFrame.from_records([(1,2.1,'line1'), (2,1.5,'line2')], + df = DataFrame.from_records([(1,2.1,'line1'), (2,1.5,'line2')], columns=['A','B','C'], index=['A','B']) df.to_sql('test_multiindex_roundtrip', self.conn) - result = sql.read_sql_query('SELECT * FROM test_multiindex_roundtrip', + result = sql.read_sql_query('SELECT * FROM test_multiindex_roundtrip', self.conn, index_col=['A','B']) - tm.assert_frame_equal(df, result, check_index_type=True) + tm.assert_frame_equal(df, result, check_index_type=True) def test_integer_col_names(self): df = DataFrame([[1, 2], [3, 4]], columns=[0, 1]) @@ -600,6 +644,40 @@ def test_get_schema(self): con=self.conn) self.assertTrue('CREATE' in create_sql) + def test_chunksize_read(self): + df = DataFrame(np.random.randn(22, 5), columns=list('abcde')) + df.to_sql('test_chunksize', self.conn, index=False) + + # reading the query in one time + res1 = sql.read_sql_query("select * from test_chunksize", self.conn) + + # reading the query in chunks with read_sql_query + res2 = DataFrame() + i = 0 + sizes = [5, 5, 5, 5, 2] + + for chunk in sql.read_sql_query("select * from test_chunksize", + self.conn, chunksize=5): + res2 = concat([res2, chunk], ignore_index=True) + self.assertEqual(len(chunk), sizes[i]) + i += 1 + + tm.assert_frame_equal(res1, 
res2) + + # reading the query in chunks with read_sql_query + if self.mode == 'sqlalchemy': + res3 = DataFrame() + i = 0 + sizes = [5, 5, 5, 5, 2] + + for chunk in sql.read_sql_table("test_chunksize", self.conn, + chunksize=5): + res3 = concat([res3, chunk], ignore_index=True) + self.assertEqual(len(chunk), sizes[i]) + i += 1 + + tm.assert_frame_equal(res1, res3) + class TestSQLApi(_TestSQLApi): """ @@ -610,6 +688,7 @@ class TestSQLApi(_TestSQLApi): """ flavor = 'sqlite' + mode = 'sqlalchemy' def connect(self): if SQLALCHEMY_INSTALLED: @@ -672,13 +751,34 @@ def test_not_reflect_all_tables(self): # Verify some things self.assertEqual(len(w), 0, "Warning triggered for other table") + def test_warning_case_insensitive_table_name(self): + # see GH7815. + # We can't test that this warning is triggered, a the database + # configuration would have to be altered. But here we test that + # the warning is certainly NOT triggered in a normal case. + with warnings.catch_warnings(record=True) as w: + # Cause all warnings to always be triggered. + warnings.simplefilter("always") + # This should not trigger a Warning + self.test_frame1.to_sql('CaseSensitive', self.conn) + # Verify some things + self.assertEqual(len(w), 0, "Warning triggered for writing a table") -class TestSQLLegacyApi(_TestSQLApi): + def _get_index_columns(self, tbl_name): + from sqlalchemy.engine import reflection + insp = reflection.Inspector.from_engine(self.conn) + ixs = insp.get_indexes('test_index_saved') + ixs = [i['column_names'] for i in ixs] + return ixs + + +class TestSQLiteFallbackApi(_TestSQLApi): """ - Test the public legacy API + Test the public sqlite connection fallback API """ flavor = 'sqlite' + mode = 'fallback' def connect(self, database=":memory:"): return sqlite3.connect(database) @@ -770,7 +870,7 @@ def connect(self): def setup_connect(self): try: self.conn = self.connect() - self.pandasSQL = sql.PandasSQLAlchemy(self.conn) + self.pandasSQL = sql.SQLDatabase(self.conn) # to test if connection can be made: self.conn.connect() except sqlalchemy.exc.OperationalError: @@ -791,6 +891,9 @@ def test_read_sql_named_parameter(self): def test_to_sql(self): self._to_sql() + def test_to_sql_empty(self): + self._to_sql_empty() + def test_to_sql_fail(self): self._to_sql_fail() @@ -805,7 +908,7 @@ def test_create_table(self): temp_frame = DataFrame( {'one': [1., 2., 3., 4.], 'two': [4., 3., 2., 1.]}) - pandasSQL = sql.PandasSQLAlchemy(temp_conn) + pandasSQL = sql.SQLDatabase(temp_conn) pandasSQL.to_sql(temp_frame, 'temp_frame') self.assertTrue( @@ -817,7 +920,7 @@ def test_drop_table(self): temp_frame = DataFrame( {'one': [1., 2., 3., 4.], 'two': [4., 3., 2., 1.]}) - pandasSQL = sql.PandasSQLAlchemy(temp_conn) + pandasSQL = sql.SQLDatabase(temp_conn) pandasSQL.to_sql(temp_frame, 'temp_frame') self.assertTrue( @@ -916,9 +1019,6 @@ def test_date_parsing(self): "IntDateCol loaded with incorrect type") def test_datetime(self): - if self.driver == 'pymysql': - raise nose.SkipTest('writing datetime not working with pymysql') - df = DataFrame({'A': date_range('2013-01-01 09:00:00', periods=3), 'B': np.arange(3.0)}) df.to_sql('test_datetime', self.conn) @@ -939,17 +1039,6 @@ def test_datetime(self): tm.assert_frame_equal(result, df) def test_datetime_NaT(self): - # status: - # - postgresql: gives error on inserting "0001-255-255T00:00:00" - # - sqlite3: works, but reading it with query returns '-001--1--1 -1:-1:-1.-00001' - - if self.driver == 'pymysql': - raise nose.SkipTest('writing datetime not working with pymysql') - if 
self.driver == 'psycopg2': - raise nose.SkipTest('writing datetime NaT not working with psycopg2') - if self.flavor == 'sqlite': - raise nose.SkipTest('reading datetime NaT not working with sqlite') - df = DataFrame({'A': date_range('2013-01-01 09:00:00', periods=3), 'B': np.arange(3.0)}) df.loc[1, 'A'] = np.nan @@ -968,6 +1057,21 @@ def test_datetime_NaT(self): else: tm.assert_frame_equal(result, df) + def test_datetime_date(self): + # test support for datetime.date + df = DataFrame([date(2014, 1, 1), date(2014, 1, 2)], columns=["a"]) + df.to_sql('test_date', self.conn, index=False) + res = read_sql_table('test_date', self.conn) + # comes back as datetime64 + tm.assert_series_equal(res['a'], to_datetime(df['a'])) + + def test_datetime_time(self): + # test support for datetime.time + df = DataFrame([time(9, 0, 0), time(9, 1, 30)], columns=["a"]) + df.to_sql('test_time', self.conn, index=False) + res = read_sql_table('test_time', self.conn) + tm.assert_frame_equal(res, df) + def test_mixed_dtype_insert(self): # see GH6509 s1 = Series(2**25 + 1,dtype=np.int32) @@ -981,9 +1085,6 @@ def test_mixed_dtype_insert(self): tm.assert_frame_equal(df, df2, check_dtype=False, check_exact=True) def test_nan_numeric(self): - if self.driver == 'pymysql': - raise nose.SkipTest('writing NaNs not working with pymysql') - # NaNs in numeric float column df = DataFrame({'A':[0, 1, 2], 'B':[0.2, np.nan, 5.6]}) df.to_sql('test_nan', self.conn, index=False) @@ -997,37 +1098,27 @@ def test_nan_numeric(self): tm.assert_frame_equal(result, df) def test_nan_fullcolumn(self): - if self.driver == 'pymysql': - raise nose.SkipTest('writing NaNs not working with pymysql') - # full NaN column (numeric float column) df = DataFrame({'A':[0, 1, 2], 'B':[np.nan, np.nan, np.nan]}) df.to_sql('test_nan', self.conn, index=False) - if self.flavor == 'sqlite': - df['B'] = df['B'].astype('object') - df['B'] = None - # with read_table result = sql.read_sql_table('test_nan', self.conn) tm.assert_frame_equal(result, df) - # with read_sql + # with read_sql -> not type info from table -> stays None + df['B'] = df['B'].astype('object') + df['B'] = None result = sql.read_sql_query('SELECT * FROM test_nan', self.conn) tm.assert_frame_equal(result, df) def test_nan_string(self): - if self.driver == 'pymysql': - raise nose.SkipTest('writing NaNs not working with pymysql') - # NaNs in string column df = DataFrame({'A':[0, 1, 2], 'B':['a', 'b', np.nan]}) df.to_sql('test_nan', self.conn, index=False) - if self.flavor == 'sqlite': - df.loc[2, 'B'] = None - elif self.flavor == 'postgresql': - df = df.fillna('NaN') + # NaNs are coming back as None + df.loc[2, 'B'] = None # with read_table result = sql.read_sql_table('test_nan', self.conn) @@ -1037,6 +1128,18 @@ def test_nan_string(self): result = sql.read_sql_query('SELECT * FROM test_nan', self.conn) tm.assert_frame_equal(result, df) + def _get_index_columns(self, tbl_name): + from sqlalchemy.engine import reflection + insp = reflection.Inspector.from_engine(self.conn) + ixs = insp.get_indexes(tbl_name) + ixs = [i['column_names'] for i in ixs] + return ixs + + def test_to_sql_save_index(self): + self._to_sql_save_index() + + def test_transactions(self): + self._transaction_test() class TestSQLiteAlchemy(_TestSQLAlchemy): """ @@ -1172,8 +1275,8 @@ class TestPostgreSQLAlchemy(_TestSQLAlchemy): flavor = 'postgresql' def connect(self): - return sqlalchemy.create_engine( - 'postgresql+{driver}://postgres@localhost/pandas_nosetest'.format(driver=self.driver)) + url = 
'postgresql+{driver}://postgres@localhost/pandas_nosetest' + return sqlalchemy.create_engine(url.format(driver=self.driver)) def setup_driver(self): try: @@ -1189,13 +1292,68 @@ def tearDown(self): for table in c.fetchall(): self.conn.execute("DROP TABLE %s" % table[0]) + def test_schema_support(self): + # only test this for postgresql (schema's not supported in mysql/sqlite) + df = DataFrame({'col1':[1, 2], 'col2':[0.1, 0.2], 'col3':['a', 'n']}) + + # create a schema + self.conn.execute("DROP SCHEMA IF EXISTS other CASCADE;") + self.conn.execute("CREATE SCHEMA other;") + + # write dataframe to different schema's + df.to_sql('test_schema_public', self.conn, index=False) + df.to_sql('test_schema_public_explicit', self.conn, index=False, + schema='public') + df.to_sql('test_schema_other', self.conn, index=False, schema='other') + + # read dataframes back in + res1 = sql.read_sql_table('test_schema_public', self.conn) + tm.assert_frame_equal(df, res1) + res2 = sql.read_sql_table('test_schema_public_explicit', self.conn) + tm.assert_frame_equal(df, res2) + res3 = sql.read_sql_table('test_schema_public_explicit', self.conn, + schema='public') + tm.assert_frame_equal(df, res3) + res4 = sql.read_sql_table('test_schema_other', self.conn, + schema='other') + tm.assert_frame_equal(df, res4) + self.assertRaises(ValueError, sql.read_sql_table, 'test_schema_other', + self.conn, schema='public') + + ## different if_exists options + + # create a schema + self.conn.execute("DROP SCHEMA IF EXISTS other CASCADE;") + self.conn.execute("CREATE SCHEMA other;") + + # write dataframe with different if_exists options + df.to_sql('test_schema_other', self.conn, schema='other', index=False) + df.to_sql('test_schema_other', self.conn, schema='other', index=False, + if_exists='replace') + df.to_sql('test_schema_other', self.conn, schema='other', index=False, + if_exists='append') + res = sql.read_sql_table('test_schema_other', self.conn, schema='other') + tm.assert_frame_equal(concat([df, df], ignore_index=True), res) + + ## specifying schema in user-provided meta + + engine2 = self.connect() + meta = sqlalchemy.MetaData(engine2, schema='other') + pdsql = sql.SQLDatabase(engine2, meta=meta) + pdsql.to_sql(df, 'test_schema_other2', index=False) + pdsql.to_sql(df, 'test_schema_other2', index=False, if_exists='replace') + pdsql.to_sql(df, 'test_schema_other2', index=False, if_exists='append') + res1 = sql.read_sql_table('test_schema_other2', self.conn, schema='other') + res2 = pdsql.read_table('test_schema_other2') + tm.assert_frame_equal(res1, res2) + #------------------------------------------------------------------------------ #--- Test Sqlite / MySQL fallback -class TestSQLiteLegacy(PandasSQLTest): +class TestSQLiteFallback(PandasSQLTest): """ - Test the legacy mode against an in-memory sqlite database. + Test the fallback mode against an in-memory sqlite database. 
""" flavor = 'sqlite' @@ -1210,7 +1368,7 @@ def drop_table(self, table_name): def setUp(self): self.conn = self.connect() - self.pandasSQL = sql.PandasSQLLegacy(self.conn, 'sqlite') + self.pandasSQL = sql.SQLiteDatabase(self.conn, 'sqlite') self._load_iris_data() @@ -1218,7 +1376,7 @@ def setUp(self): def test_invalid_flavor(self): self.assertRaises( - NotImplementedError, sql.PandasSQLLegacy, self.conn, 'oracle') + NotImplementedError, sql.SQLiteDatabase, self.conn, 'oracle') def test_read_sql(self): self._read_sql_iris() @@ -1232,6 +1390,9 @@ def test_read_sql_named_parameter(self): def test_to_sql(self): self._to_sql() + def test_to_sql_empty(self): + self._to_sql_empty() + def test_to_sql_fail(self): self._to_sql_fail() @@ -1261,8 +1422,43 @@ def test_roundtrip(self): def test_execute_sql(self): self._execute_sql() - -class TestMySQLLegacy(TestSQLiteLegacy): + def test_datetime_date(self): + # test support for datetime.date + df = DataFrame([date(2014, 1, 1), date(2014, 1, 2)], columns=["a"]) + df.to_sql('test_date', self.conn, index=False, flavor=self.flavor) + res = read_sql_query('SELECT * FROM test_date', self.conn) + if self.flavor == 'sqlite': + # comes back as strings + tm.assert_frame_equal(res, df.astype(str)) + elif self.flavor == 'mysql': + tm.assert_frame_equal(res, df) + + def test_datetime_time(self): + # test support for datetime.time + df = DataFrame([time(9, 0, 0), time(9, 1, 30)], columns=["a"]) + # test it raises an error and not fails silently (GH8341) + if self.flavor == 'sqlite': + self.assertRaises(sqlite3.InterfaceError, sql.to_sql, df, + 'test_time', self.conn) + + def _get_index_columns(self, tbl_name): + ixs = sql.read_sql_query( + "SELECT * FROM sqlite_master WHERE type = 'index' " + + "AND tbl_name = '%s'" % tbl_name, self.conn) + ix_cols = [] + for ix_name in ixs.name: + ix_info = sql.read_sql_query( + "PRAGMA index_info(%s)" % ix_name, self.conn) + ix_cols.append(ix_info.name.tolist()) + return ix_cols + + def test_to_sql_save_index(self): + self._to_sql_save_index() + + def test_transactions(self): + self._transaction_test() + +class TestMySQLLegacy(TestSQLiteFallback): """ Test the legacy mode against a MySQL database. 
@@ -1296,7 +1492,7 @@ def setUp(self): except self.driver.err.OperationalError: raise nose.SkipTest("Can't connect to MySQL server") - self.pandasSQL = sql.PandasSQLLegacy(self.conn, 'mysql') + self.pandasSQL = sql.SQLiteDatabase(self.conn, 'mysql') self._load_iris_data() self._load_test1_data() @@ -1317,6 +1513,28 @@ def test_a_deprecation(self): sql.has_table('test_frame1', self.conn, flavor='mysql'), 'Table not written to DB') + def _get_index_columns(self, tbl_name): + ixs = sql.read_sql_query( + "SHOW INDEX IN %s" % tbl_name, self.conn) + ix_cols = {} + for ix_name, ix_col in zip(ixs.Key_name, ixs.Column_name): + if ix_name not in ix_cols: + ix_cols[ix_name] = [] + ix_cols[ix_name].append(ix_col) + return list(ix_cols.values()) + + def test_to_sql_save_index(self): + self._to_sql_save_index() + + for ix_name, ix_col in zip(ixs.Key_name, ixs.Column_name): + if ix_name not in ix_cols: + ix_cols[ix_name] = [] + ix_cols[ix_name].append(ix_col) + return ix_cols.values() + + def test_to_sql_save_index(self): + self._to_sql_save_index() + #------------------------------------------------------------------------------ #--- Old tests from 0.13.1 (before refactor using sqlalchemy) @@ -1413,7 +1631,7 @@ def test_schema(self): frame = tm.makeTimeDataFrame() create_sql = sql.get_schema(frame, 'test', 'sqlite', keys=['A', 'B'],) lines = create_sql.splitlines() - self.assertTrue('PRIMARY KEY (A,B)' in create_sql) + self.assertTrue('PRIMARY KEY ([A],[B])' in create_sql) cur = self.db.cursor() cur.execute(create_sql) @@ -1692,7 +1910,7 @@ def test_schema(self): drop_sql = "DROP TABLE IF EXISTS test" create_sql = sql.get_schema(frame, 'test', 'mysql', keys=['A', 'B'],) lines = create_sql.splitlines() - self.assertTrue('PRIMARY KEY (A,B)' in create_sql) + self.assertTrue('PRIMARY KEY (`A`,`B`)' in create_sql) cur = self.db.cursor() cur.execute(drop_sql) cur.execute(create_sql) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 459a1fe6c0e89..2cb7809166be5 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -5,17 +5,20 @@ import os import warnings import nose +import struct import sys from distutils.version import LooseVersion import numpy as np import pandas as pd +from pandas.compat import iterkeys from pandas.core.frame import DataFrame, Series from pandas.io.parsers import read_csv from pandas.io.stata import (read_stata, StataReader, InvalidColumnName, - PossiblePrecisionLoss) + PossiblePrecisionLoss, StataMissingValue) import pandas.util.testing as tm +from pandas.tslib import NaT from pandas.util.misc import is_little_endian from pandas import compat @@ -71,7 +74,16 @@ def setUp(self): self.dta16_115 = os.path.join(self.dirpath, 'stata7_115.dta') self.dta16_117 = os.path.join(self.dirpath, 'stata7_117.dta') + self.dta17_113 = os.path.join(self.dirpath, 'stata8_113.dta') + self.dta17_115 = os.path.join(self.dirpath, 'stata8_115.dta') + self.dta17_117 = os.path.join(self.dirpath, 'stata8_117.dta') + + self.dta18_115 = os.path.join(self.dirpath, 'stata9_115.dta') + self.dta18_117 = os.path.join(self.dirpath, 'stata9_117.dta') + + def read_dta(self, file): + # Legacy default reader configuration return read_stata(file, convert_dates=True) def read_csv(self, file): @@ -589,6 +601,148 @@ def test_excessively_long_string(self): with tm.ensure_clean() as path: original.to_stata(path) + def test_missing_value_generator(self): + types = ('b','h','l') + df = DataFrame([[0.0]],columns=['float_']) + with tm.ensure_clean() as path: + df.to_stata(path) 
+ valid_range = StataReader(path).VALID_RANGE + expected_values = ['.' + chr(97 + i) for i in range(26)] + expected_values.insert(0, '.') + for t in types: + offset = valid_range[t][1] + for i in range(0,27): + val = StataMissingValue(offset+1+i) + self.assertTrue(val.string == expected_values[i]) + + # Test extremes for floats + val = StataMissingValue(struct.unpack(' 0 + global errno lines = line_end - line_start result = np.empty(lines, dtype=np.float64) data = result.data @@ -1436,8 +1463,9 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end, na_count += 1 data[0] = NA else: - error = to_double(word, data, parser.sci, parser.decimal, parser.thousands) - if error != 1: + data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci, + parser.thousands, 1) + if errno != 0 or p_end[0] or p_end == word: if strcasecmp(word, cinf) == 0: data[0] = INF elif strcasecmp(word, cneginf) == 0: @@ -1452,8 +1480,9 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end, else: for i in range(lines): word = COLITER_NEXT(it) - error = to_double(word, data, parser.sci, parser.decimal, parser.thousands) - if error != 1: + data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci, + parser.thousands, 1) + if errno != 0 or p_end[0] or p_end == word: if strcasecmp(word, cinf) == 0: data[0] = INF elif strcasecmp(word, cneginf) == 0: diff --git a/pandas/rpy/common.py b/pandas/rpy/common.py index 5747285deb988..55adad3610816 100644 --- a/pandas/rpy/common.py +++ b/pandas/rpy/common.py @@ -4,6 +4,7 @@ """ from __future__ import print_function +from distutils.version import LooseVersion from pandas.compat import zip, range import numpy as np @@ -72,7 +73,7 @@ def _list(item): return list(item) except TypeError: return [] - + # For iris3, HairEyeColor, UCBAdmissions, Titanic dim = list(obj.dim) values = np.array(list(obj)) @@ -101,9 +102,9 @@ def _convert_vector(obj): except AttributeError: return list(obj) if 'names' in attributes: - return pd.Series(list(obj), index=r['names'](obj)) + return pd.Series(list(obj), index=r['names'](obj)) elif 'tsp' in attributes: - return pd.Series(list(obj), index=r['time'](obj)) + return pd.Series(list(obj), index=r['time'](obj)) elif 'labels' in attributes: return pd.Series(list(obj), index=r['labels'](obj)) if _rclass(obj) == 'dist': @@ -268,6 +269,7 @@ def convert_to_r_posixct(obj): np.str: robj.StrVector, np.bool: robj.BoolVector} + NA_TYPES = {np.float64: robj.NA_Real, np.float32: robj.NA_Real, np.float: robj.NA_Real, @@ -279,6 +281,16 @@ def convert_to_r_posixct(obj): np.bool: robj.NA_Logical} +if LooseVersion(np.__version__) >= LooseVersion('1.8'): + for dict_ in (VECTOR_TYPES, NA_TYPES): + dict_.update({ + np.bool_: dict_[np.bool], + np.int_: dict_[np.int], + np.float_: dict_[np.float], + np.string_: dict_[np.str] + }) + + def convert_to_r_dataframe(df, strings_as_factors=False): """ Convert a pandas DataFrame to a R data.frame. 
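Regarding the rpy/common.py change just above: on NumPy >= 1.8 a converted column's dtype reports the sized scalar types (np.bool_, np.int_, np.float_, np.string_), which are distinct dictionary keys from the plain aliases already used in VECTOR_TYPES/NA_TYPES, hence the extra entries. A minimal sketch of the lookup problem, with an illustrative table rather than the real converter:

import numpy as np

# A type-keyed lookup like VECTOR_TYPES, using only the plain aliases.
lookup = {float: 'FloatVector', int: 'IntVector', bool: 'BoolVector'}

key = np.array([1.0, 2.0]).dtype.type   # np.float64, i.e. np.float_
print(key in lookup)                    # False: np.float64 is not the same key as float

# Register the sized type as an alias, as the patch does for each table.
lookup[np.float64] = lookup[float]
print(lookup[key])                      # 'FloatVector'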
diff --git a/pandas/sparse/panel.py b/pandas/sparse/panel.py index 20bbc58cc908f..62e0e3e985775 100644 --- a/pandas/sparse/panel.py +++ b/pandas/sparse/panel.py @@ -427,9 +427,9 @@ def _new_like(self, new_frames): default_kind=self.default_kind) def _combinePanel(self, other, func): - items = self.items + other.items - major = self.major_axis + other.major_axis - minor = self.minor_axis + other.minor_axis + items = self.items.union(other.items) + major = self.major_axis.union(other.major_axis) + minor = self.minor_axis.union(other.minor_axis) # could check that everything's the same size, but forget it diff --git a/pandas/src/join.pyx b/pandas/src/join.pyx index 91102a2fa6a18..8a9cf01375a68 100644 --- a/pandas/src/join.pyx +++ b/pandas/src/join.pyx @@ -103,13 +103,21 @@ def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right, left_indexer = _get_result_indexer(left_sorter, left_indexer) right_indexer = _get_result_indexer(right_sorter, right_indexer) - if not sort: - if left_sorter.dtype != np.int_: - left_sorter = left_sorter.astype(np.int_) - - rev = np.empty(len(left), dtype=np.int_) - rev.put(left_sorter, np.arange(len(left))) + if not sort: # if not asked to sort, revert to original order + if len(left) == len(left_indexer): + # no multiple matches for any row on the left + # this is a short-cut to avoid groupsort_indexer + # otherwise, the `else` path also works in this case + if left_sorter.dtype != np.int_: + left_sorter = left_sorter.astype(np.int_) + + rev = np.empty(len(left), dtype=np.int_) + rev.put(left_sorter, np.arange(len(left))) + else: + rev, _ = groupsort_indexer(left_indexer, len(left)) + if rev.dtype != np.int_: + rev = rev.astype(np.int_) right_indexer = right_indexer.take(rev) left_indexer = left_indexer.take(rev) diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 1e9576487b9ed..9a7303b6874db 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -693,15 +693,34 @@ int tokenize_delimited(parser_t *self, size_t line_limit) if (c == '\n') { // \n\r possible? 
- END_LINE(); + if (self->skip_empty_lines) + { + self->file_lines++; + } + else + { + END_LINE(); + } break; - } else if (c == '\r') { - self->state = EAT_CRNL; + } + else if (c == '\r') { + if (self->skip_empty_lines) + { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } + else + self->state = EAT_CRNL; break; - } else if (c == self->commentchar) { + } + else if (c == self->commentchar) { self->state = EAT_LINE_COMMENT; break; } + else if (IS_WHITESPACE(c) && c != self->delimiter && self->skip_empty_lines) { + self->state = WHITESPACE_LINE; + break; + } /* normal character - handle as START_FIELD */ self->state = START_FIELD; @@ -747,6 +766,32 @@ int tokenize_delimited(parser_t *self, size_t line_limit) } break; + case WHITESPACE_LINE: // check if line is whitespace-only + if (c == '\n') { + self->file_lines++; + self->state = START_RECORD; // ignore empty line + } + else if (c == '\r') { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } + else if (IS_WHITESPACE(c) && c != self->delimiter) + ; + else { // backtrack + /* We have to use i + 1 because buf has been incremented but not i */ + while (i + 1 > self->datapos && *buf != '\n') { + --buf; + --i; + } + if (i + 1 > self->datapos) // reached a newline rather than the beginning + { + ++buf; // move pointer to first char after newline + ++i; + } + self->state = START_FIELD; + } + break; + case ESCAPED_CHAR: /* if (c == '\0') */ /* c = '\n'; */ @@ -904,7 +949,6 @@ int tokenize_delimited(parser_t *self, size_t line_limit) --buf; } break; - default: break; @@ -966,7 +1010,23 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit) // start of record if (c == self->lineterminator) { // \n\r possible? - END_LINE(); + if (self->skip_empty_lines) + { + self->file_lines++; + } + else + { + END_LINE(); + } + break; + } + else if (c == self->commentchar) { + self->state = EAT_LINE_COMMENT; + break; + } + else if (IS_WHITESPACE(c) && c != self->delimiter && self->skip_empty_lines) + { + self->state = WHITESPACE_LINE; break; } /* normal character - handle as START_FIELD */ @@ -1010,6 +1070,28 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit) } break; + case WHITESPACE_LINE: // check if line is whitespace-only + if (c == self->lineterminator) { + self->file_lines++; + self->state = START_RECORD; // ignore empty line + } + else if (IS_WHITESPACE(c) && c != self->delimiter) + ; + else { // backtrack + /* We have to use i + 1 because buf has been incremented but not i */ + while (i + 1 > self->datapos && *buf != self->lineterminator) { + --buf; + --i; + } + if (i + 1 > self->datapos) // reached a newline rather than the beginning + { + ++buf; // move pointer to first char after newline + ++i; + } + self->state = START_FIELD; + } + break; + case ESCAPED_CHAR: /* if (c == '\0') */ /* c = '\n'; */ @@ -1103,6 +1185,13 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit) } break; + case EAT_LINE_COMMENT: + if (c == self->lineterminator) { + self->file_lines++; + self->state = START_RECORD; + } + break; + case EAT_COMMENT: if (c == self->lineterminator) { END_LINE(); @@ -1163,9 +1252,27 @@ int tokenize_whitespace(parser_t *self, size_t line_limit) self->state)); switch(self->state) { + case WHITESPACE_LINE: + if (c == '\n') { + self->file_lines++; + self->state = START_RECORD; + break; + } + else if (c == '\r') { + self->file_lines++; + self->state = EAT_CRNL_NOP; + break; + } + // fall through case EAT_WHITESPACE: - if (!IS_WHITESPACE(c)) { + if (c == '\n') { + END_LINE(); + self->state = START_RECORD; 
+ } else if (c == '\r') { + self->state = EAT_CRNL; + break; + } else if (!IS_WHITESPACE(c)) { // END_FIELD(); self->state = START_FIELD; // Fall through to subsequent state @@ -1178,13 +1285,32 @@ int tokenize_whitespace(parser_t *self, size_t line_limit) // start of record if (c == '\n') { // \n\r possible? - END_LINE(); + if (self->skip_empty_lines) + { + self->file_lines++; + } + else + { + END_LINE(); + } break; } else if (c == '\r') { - self->state = EAT_CRNL; + if (self->skip_empty_lines) + { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } + else + self->state = EAT_CRNL; break; } else if (IS_WHITESPACE(c)) { - self->state = EAT_WHITESPACE; + /*if (self->skip_empty_lines) + self->state = WHITESPACE_LINE; + else*/ + self->state = EAT_WHITESPACE; + break; + } else if (c == self->commentchar) { + self->state = EAT_LINE_COMMENT; break; } else { /* normal character - handle as START_FIELD */ @@ -1231,6 +1357,16 @@ int tokenize_whitespace(parser_t *self, size_t line_limit) } break; + case EAT_LINE_COMMENT: + if (c == '\n') { + self->file_lines++; + self->state = START_RECORD; + } else if (c == '\r') { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } + break; + case ESCAPED_CHAR: /* if (c == '\0') */ /* c = '\n'; */ @@ -1351,6 +1487,15 @@ int tokenize_whitespace(parser_t *self, size_t line_limit) } break; + case EAT_CRNL_NOP: // inside an ignored comment line + self->state = START_RECORD; + /* \r line terminator -- parse this character again */ + if (c != '\n' && c != self->delimiter) { + --i; + --buf; + } + break; + case EAT_COMMENT: if (c == '\n') { END_LINE(); @@ -1656,10 +1801,6 @@ void test_count_lines(char *fname) { -// forward declaration -static double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing); - - P_INLINE void lowercase(char *p) { for ( ; *p; ++p) *p = tolower(*p); } @@ -1669,32 +1810,6 @@ P_INLINE void uppercase(char *p) { } -/* - * `item` must be the nul-terminated string that is to be - * converted to a double. - * - * To be successful, to_double() must use *all* the characters - * in `item`. E.g. "1.q25" will fail. Leading and trailing - * spaces are allowed. - * - * `sci` is the scientific notation exponent character, usually - * either 'E' or 'D'. Case is ignored. - * - * `decimal` is the decimal point character, usually either - * '.' or ','. 
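As a user-facing sanity check of the blank-line handling added to the tokenizer above, here is a minimal illustrative snippet (not part of this patch; assumes Python 3 for io.StringIO): blank lines are counted internally via file_lines but contribute no data rows, and whitespace-only lines are handled the same way through the new WHITESPACE_LINE state.

    from io import StringIO
    import pandas as pd

    data = "a,b\n1,2\n\n3,4\n"
    # The empty third line is consumed by the tokenizer (file_lines++) but is
    # never emitted as a record, so only two data rows come back.
    df = pd.read_csv(StringIO(data))
    print(df.shape)  # expected: (2, 2)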
- * - */ - -int to_double(char *item, double *p_value, char sci, char decimal, char tsep) -{ - char *p_end; - - *p_value = xstrtod(item, &p_end, decimal, sci, tsep, TRUE); - - return (errno == 0) && (!*p_end); -} - - int P_INLINE to_complex(char *item, double *p_real, double *p_imag, char sci, char decimal) { char *p_end; @@ -1884,7 +1999,7 @@ int main(int argc, char *argv[]) // * Add tsep argument for thousands separator // -static double xstrtod(const char *str, char **endptr, char decimal, +double xstrtod(const char *str, char **endptr, char decimal, char sci, char tsep, int skip_trailing) { double number; @@ -2015,6 +2130,175 @@ static double xstrtod(const char *str, char **endptr, char decimal, return number; } +double precise_xstrtod(const char *str, char **endptr, char decimal, + char sci, char tsep, int skip_trailing) +{ + double number; + int exponent; + int negative; + char *p = (char *) str; + int num_digits; + int num_decimals; + int max_digits = 17; + int n; + // Cache powers of 10 in memory + static double e[] = {1., 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, + 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, + 1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27, 1e28, 1e29, 1e30, + 1e31, 1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38, 1e39, 1e40, + 1e41, 1e42, 1e43, 1e44, 1e45, 1e46, 1e47, 1e48, 1e49, 1e50, + 1e51, 1e52, 1e53, 1e54, 1e55, 1e56, 1e57, 1e58, 1e59, 1e60, + 1e61, 1e62, 1e63, 1e64, 1e65, 1e66, 1e67, 1e68, 1e69, 1e70, + 1e71, 1e72, 1e73, 1e74, 1e75, 1e76, 1e77, 1e78, 1e79, 1e80, + 1e81, 1e82, 1e83, 1e84, 1e85, 1e86, 1e87, 1e88, 1e89, 1e90, + 1e91, 1e92, 1e93, 1e94, 1e95, 1e96, 1e97, 1e98, 1e99, 1e100, + 1e101, 1e102, 1e103, 1e104, 1e105, 1e106, 1e107, 1e108, 1e109, 1e110, + 1e111, 1e112, 1e113, 1e114, 1e115, 1e116, 1e117, 1e118, 1e119, 1e120, + 1e121, 1e122, 1e123, 1e124, 1e125, 1e126, 1e127, 1e128, 1e129, 1e130, + 1e131, 1e132, 1e133, 1e134, 1e135, 1e136, 1e137, 1e138, 1e139, 1e140, + 1e141, 1e142, 1e143, 1e144, 1e145, 1e146, 1e147, 1e148, 1e149, 1e150, + 1e151, 1e152, 1e153, 1e154, 1e155, 1e156, 1e157, 1e158, 1e159, 1e160, + 1e161, 1e162, 1e163, 1e164, 1e165, 1e166, 1e167, 1e168, 1e169, 1e170, + 1e171, 1e172, 1e173, 1e174, 1e175, 1e176, 1e177, 1e178, 1e179, 1e180, + 1e181, 1e182, 1e183, 1e184, 1e185, 1e186, 1e187, 1e188, 1e189, 1e190, + 1e191, 1e192, 1e193, 1e194, 1e195, 1e196, 1e197, 1e198, 1e199, 1e200, + 1e201, 1e202, 1e203, 1e204, 1e205, 1e206, 1e207, 1e208, 1e209, 1e210, + 1e211, 1e212, 1e213, 1e214, 1e215, 1e216, 1e217, 1e218, 1e219, 1e220, + 1e221, 1e222, 1e223, 1e224, 1e225, 1e226, 1e227, 1e228, 1e229, 1e230, + 1e231, 1e232, 1e233, 1e234, 1e235, 1e236, 1e237, 1e238, 1e239, 1e240, + 1e241, 1e242, 1e243, 1e244, 1e245, 1e246, 1e247, 1e248, 1e249, 1e250, + 1e251, 1e252, 1e253, 1e254, 1e255, 1e256, 1e257, 1e258, 1e259, 1e260, + 1e261, 1e262, 1e263, 1e264, 1e265, 1e266, 1e267, 1e268, 1e269, 1e270, + 1e271, 1e272, 1e273, 1e274, 1e275, 1e276, 1e277, 1e278, 1e279, 1e280, + 1e281, 1e282, 1e283, 1e284, 1e285, 1e286, 1e287, 1e288, 1e289, 1e290, + 1e291, 1e292, 1e293, 1e294, 1e295, 1e296, 1e297, 1e298, 1e299, 1e300, + 1e301, 1e302, 1e303, 1e304, 1e305, 1e306, 1e307, 1e308}; + errno = 0; + + // Skip leading whitespace + while (isspace(*p)) p++; + + // Handle optional sign + negative = 0; + switch (*p) + { + case '-': negative = 1; // Fall through to increment position + case '+': p++; + } + + number = 0.; + exponent = 0; + num_digits = 0; + num_decimals = 0; + + // Process string of digits + while (isdigit(*p)) + { + if (num_digits < max_digits) + { + number = 
number * 10. + (*p - '0'); + num_digits++; + } + else + ++exponent; + + p++; + p += (tsep != '\0' & *p == tsep); + } + + // Process decimal part + if (*p == decimal) + { + p++; + + while (num_digits < max_digits && isdigit(*p)) + { + number = number * 10. + (*p - '0'); + p++; + num_digits++; + num_decimals++; + } + + if (num_digits >= max_digits) // consume extra decimal digits + while (isdigit(*p)) + ++p; + + exponent -= num_decimals; + } + + if (num_digits == 0) + { + errno = ERANGE; + return 0.0; + } + + // Correct for sign + if (negative) number = -number; + + // Process an exponent string + if (toupper(*p) == toupper(sci)) + { + // Handle optional sign + negative = 0; + switch (*++p) + { + case '-': negative = 1; // Fall through to increment pos + case '+': p++; + } + + // Process string of digits + n = 0; + while (isdigit(*p)) + { + n = n * 10 + (*p - '0'); + p++; + } + + if (negative) + exponent -= n; + else + exponent += n; + } + + if (exponent > 308) + { + errno = ERANGE; + return HUGE_VAL; + } + else if (exponent > 0) + number *= e[exponent]; + else if (exponent < -308) // subnormal + { + if (exponent < -616) // prevent invalid array access + number = 0.; + number /= e[-308 - exponent]; + number /= e[308]; + } + else + number /= e[-exponent]; + + if (number == HUGE_VAL || number == -HUGE_VAL) + errno = ERANGE; + + if (skip_trailing) { + // Skip trailing whitespace + while (isspace(*p)) p++; + } + + if (endptr) *endptr = p; + return number; +} + +double round_trip(const char *p, char **q, char decimal, char sci, + char tsep, int skip_trailing) +{ +#if PY_VERSION_HEX >= 0x02070000 + return PyOS_string_to_double(p, q, 0); +#else + return strtod(p, q); +#endif +} + /* float strtof(const char *str, char **endptr) { diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h index 6af63c07f1104..0947315fbe6b7 100644 --- a/pandas/src/parser/tokenizer.h +++ b/pandas/src/parser/tokenizer.h @@ -12,6 +12,7 @@ See LICENSE for the license #ifndef _PARSER_COMMON_H_ #define _PARSER_COMMON_H_ +#include "Python.h" #include #include #include @@ -125,6 +126,7 @@ typedef enum { EAT_WHITESPACE, EAT_COMMENT, EAT_LINE_COMMENT, + WHITESPACE_LINE, FINISHED } ParserState; @@ -202,10 +204,13 @@ typedef struct parser_t { void *skipset; int skip_footer; + double (*converter)(const char *, char **, char, char, char, int); // error handling char *warn_msg; char *error_msg; + + int skip_empty_lines; } parser_t; @@ -257,7 +262,9 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep); uint64_t str_to_uint64(const char *p_item, uint64_t uint_max, int *error); -int P_INLINE to_double(char *item, double *p_value, char sci, char decimal, char tsep); +double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing); +double precise_xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing); +double round_trip(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing); int P_INLINE to_complex(char *item, double *p_real, double *p_imag, char sci, char decimal); int P_INLINE to_longlong(char *item, long long *p_value); int P_INLINE to_longlong_thousands(char *item, long long *p_value, char tsep); diff --git a/pandas/src/testing.pyx b/pandas/src/testing.pyx index bff070421c841..4977a80acc936 100644 --- a/pandas/src/testing.pyx +++ b/pandas/src/testing.pyx @@ -122,6 +122,10 @@ cpdef assert_almost_equal(a, b, bint check_less_precise=False): if np.isinf(a): assert np.isinf(b), "First 
object is inf, second isn't" + if np.isposinf(a): + assert np.isposinf(b), "First object is positive inf, second is negative inf" + else: + assert np.isneginf(b), "First object is negative inf, second is positive inf" else: fa, fb = a, b diff --git a/pandas/stats/interface.py b/pandas/stats/interface.py index 6d7bf329b4bee..96b2b3e32be0d 100644 --- a/pandas/stats/interface.py +++ b/pandas/stats/interface.py @@ -83,6 +83,14 @@ def ols(**kwargs): The appropriate OLS object, which allows you to obtain betas and various statistics, such as std err, t-stat, etc. """ + + if (kwargs.get('cluster') is not None and + kwargs.get('nw_lags') is not None): + raise ValueError( + 'Pandas OLS does not work with Newey-West correction ' + 'and clustering.') + + pool = kwargs.get('pool') if 'pool' in kwargs: del kwargs['pool'] diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py index 44b8bbd0c9078..41a768783b1cb 100644 --- a/pandas/stats/moments.py +++ b/pandas/stats/moments.py @@ -80,8 +80,8 @@ halflife : float, optional Specify decay in terms of halflife, :math:`\alpha = 1 - exp(log(0.5) / halflife)` min_periods : int, default 0 - Number of observations in sample to require (only affects - beginning) + Minimum number of observations in window required to have a value + (otherwise result is NA). freq : None or string alias / date offset object, default=None Frequency to conform to before computing statistic adjust : boolean, default True @@ -167,6 +167,11 @@ elements, only complete pairwise observations will be used. """ +_ddof_kw = """ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. +""" + _bias_kw = r"""bias : boolean, default False Use a standard estimation bias correction """ @@ -201,25 +206,25 @@ def rolling_count(arg, window, freq=None, center=False, how=None): of :meth:`~pandas.Series.resample` (i.e. using the `mean`). """ arg = _conv_timerule(arg, freq, how) - window = min(window, len(arg)) + if not center: + window = min(window, len(arg)) return_hook, values = _process_data_structure(arg, kill_inf=False) converted = np.isfinite(values).astype(float) - result = rolling_sum(converted, window, min_periods=1, + result = rolling_sum(converted, window, min_periods=0, center=center) # already converted # putmask here? 
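For reference, a small illustrative example (not part of this patch) of what rolling_count is expected to return after this change, using pandas.stats.moments as it is imported in the tests below:

    import numpy as np
    from pandas import Series
    import pandas.stats.moments as mom

    s = Series([1.0, np.nan, 3.0, 4.0])
    # rolling_count counts the non-NaN observations in each window; with
    # min_periods=0 the leading partial window still yields a count, and a
    # window containing NaN simply counts fewer observations.
    print(mom.rolling_count(s, 2).tolist())  # expected: [1.0, 1.0, 1.0, 2.0]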
result[np.isnan(result)] = 0 - return return_hook(result) @Substitution("Unbiased moving covariance.", _binary_arg_flex, - _roll_kw%'None'+_pairwise_kw, _flex_retval, _roll_notes) + _roll_kw%'None'+_pairwise_kw+_ddof_kw, _flex_retval, _roll_notes) @Appender(_doc_template) def rolling_cov(arg1, arg2=None, window=None, min_periods=None, freq=None, - center=False, pairwise=None, how=None): + center=False, pairwise=None, how=None, ddof=1): if window is None and isinstance(arg2, (int, float)): window = arg2 arg2 = arg1 @@ -233,7 +238,7 @@ def rolling_cov(arg1, arg2=None, window=None, min_periods=None, freq=None, def _get_cov(X, Y): mean = lambda x: rolling_mean(x, window, min_periods, center=center) count = rolling_count(X + Y, window, center=center) - bias_adj = count / (count - 1) + bias_adj = count / (count - ddof) return (mean(X * Y) - mean(X) * mean(Y)) * bias_adj rs = _flex_binary_moment(arg1, arg2, _get_cov, pairwise=bool(pairwise)) return rs @@ -280,7 +285,8 @@ def _flex_binary_moment(arg1, arg2, f, pairwise=False): elif isinstance(arg1, DataFrame): def dataframe_from_int_dict(data, frame_template): result = DataFrame(data, index=frame_template.index) - result.columns = frame_template.columns[result.columns] + if len(result.columns) > 0: + result.columns = frame_template.columns[result.columns] return result results = {} @@ -314,8 +320,10 @@ def dataframe_from_int_dict(data, frame_template): else: results[i][j] = f(*_prep_binary(arg1.iloc[:, i], arg2.iloc[:, j])) p = Panel.from_dict(results).swapaxes('items', 'major') - p.major_axis = arg1.columns[p.major_axis] - p.minor_axis = arg2.columns[p.minor_axis] + if len(p.major_axis) > 0: + p.major_axis = arg1.columns[p.major_axis] + if len(p.minor_axis) > 0: + p.minor_axis = arg2.columns[p.minor_axis] return p else: raise ValueError("'pairwise' is not True/False") @@ -372,19 +380,27 @@ def _rolling_moment(arg, window, func, minp, axis=0, freq=None, center=False, y : type of input """ arg = _conv_timerule(arg, freq, how) - calc = lambda x: func(x, window, minp=minp, args=args, kwargs=kwargs, - **kwds) + return_hook, values = _process_data_structure(arg) - # actually calculate the moment. Faster way to do this? - if values.ndim > 1: - result = np.apply_along_axis(calc, axis, values) + + if values.size == 0: + result = values.copy() else: - result = calc(values) + # actually calculate the moment. Faster way to do this? + offset = int((window - 1) / 2.) if center else 0 + additional_nans = np.array([np.NaN] * offset) + calc = lambda x: func(np.concatenate((x, additional_nans)) if center else x, + window, minp=minp, args=args, kwargs=kwargs, + **kwds) + if values.ndim > 1: + result = np.apply_along_axis(calc, axis, values) + else: + result = calc(values) - rs = return_hook(result) if center: - rs = _center_window(rs, window, axis) - return rs + result = _center_window(result, window, axis) + + return return_hook(result) def _center_window(rs, window, axis): @@ -393,20 +409,13 @@ def _center_window(rs, window, axis): "dimensions") offset = int((window - 1) / 2.) 
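The centering strategy used in this patch (append offset trailing NaNs, evaluate the right-aligned moment, then drop the first offset labels) can be summarised in a few lines. The helper below is a hypothetical sketch of that idea, not code from the patch:

    import numpy as np

    def centered_moment(values, window, moment_func):
        # Pad the tail with `offset` NaNs so the trailing windows exist,
        # evaluate the (right-aligned) moment over the padded array, then
        # shift the result left by `offset` so each label lines up with the
        # centre of its window.
        offset = int((window - 1) / 2.)
        padded = np.concatenate((values, np.full(offset, np.nan)))
        result = moment_func(padded, window)
        return result[offset:] if offset else result

For Series/DataFrame/Panel inputs the patch achieves the same alignment with slice_shift(-offset), which preserves the index instead of slicing positionally.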
- if isinstance(rs, (Series, DataFrame, Panel)): - rs = rs.shift(-offset, axis=axis) - else: - rs_indexer = [slice(None)] * rs.ndim - rs_indexer[axis] = slice(None, -offset) - - lead_indexer = [slice(None)] * rs.ndim - lead_indexer[axis] = slice(offset, None) - - na_indexer = [slice(None)] * rs.ndim - na_indexer[axis] = slice(-offset, None) - - rs[tuple(rs_indexer)] = np.copy(rs[tuple(lead_indexer)]) - rs[tuple(na_indexer)] = np.nan + if offset > 0: + if isinstance(rs, (Series, DataFrame, Panel)): + rs = rs.slice_shift(-offset, axis=axis) + else: + lead_indexer = [slice(None)] * rs.ndim + lead_indexer[axis] = slice(offset, None) + rs = np.copy(rs[tuple(lead_indexer)]) return rs @@ -458,50 +467,46 @@ def _get_center_of_mass(com, span, halflife): @Appender(_doc_template) def ewma(arg, com=None, span=None, halflife=None, min_periods=0, freq=None, adjust=True, how=None, ignore_na=False): - com = _get_center_of_mass(com, span, halflife) arg = _conv_timerule(arg, freq, how) + com = _get_center_of_mass(com, span, halflife) def _ewma(v): - result = algos.ewma(v, com, int(adjust), int(ignore_na)) - if min_periods > 1: - first_index = _first_valid_index(v) - result[first_index: first_index + min_periods - 1] = NaN - return result + return algos.ewma(v, com, int(adjust), int(ignore_na), int(min_periods)) return_hook, values = _process_data_structure(arg) - output = np.apply_along_axis(_ewma, 0, values) + if values.size == 0: + output = values.copy() + else: + output = np.apply_along_axis(_ewma, 0, values) return return_hook(output) -def _first_valid_index(arr): - # argmax scans from left - return notnull(arr).argmax() if len(arr) else 0 - - @Substitution("Exponentially-weighted moving variance", _unary_arg, _ewm_kw+_bias_kw, _type_of_input_retval, _ewm_notes) @Appender(_doc_template) def ewmvar(arg, com=None, span=None, halflife=None, min_periods=0, bias=False, - freq=None, how=None, ignore_na=False): - com = _get_center_of_mass(com, span, halflife) + freq=None, how=None, ignore_na=False, adjust=True): arg = _conv_timerule(arg, freq, how) - moment2nd = ewma(arg * arg, com=com, min_periods=min_periods, ignore_na=ignore_na) - moment1st = ewma(arg, com=com, min_periods=min_periods, ignore_na=ignore_na) + com = _get_center_of_mass(com, span, halflife) - result = moment2nd - moment1st ** 2 - if not bias: - result *= (1.0 + 2.0 * com) / (2.0 * com) + def _ewmvar(v): + return algos.ewmcov(v, v, com, int(adjust), int(ignore_na), int(min_periods), int(bias)) - return result + return_hook, values = _process_data_structure(arg) + if values.size == 0: + output = values.copy() + else: + output = np.apply_along_axis(_ewmvar, 0, values) + return return_hook(output) @Substitution("Exponentially-weighted moving std", _unary_arg, _ewm_kw+_bias_kw, _type_of_input_retval, _ewm_notes) @Appender(_doc_template) def ewmstd(arg, com=None, span=None, halflife=None, min_periods=0, bias=False, - ignore_na=False): + ignore_na=False, adjust=True): result = ewmvar(arg, com=com, span=span, halflife=halflife, - min_periods=min_periods, bias=bias, ignore_na=ignore_na) + min_periods=min_periods, bias=bias, adjust=adjust, ignore_na=ignore_na) return _zsqrt(result) ewmvol = ewmstd @@ -511,7 +516,7 @@ def ewmstd(arg, com=None, span=None, halflife=None, min_periods=0, bias=False, _ewm_kw+_pairwise_kw, _type_of_input_retval, _ewm_notes) @Appender(_doc_template) def ewmcov(arg1, arg2=None, com=None, span=None, halflife=None, min_periods=0, - bias=False, freq=None, pairwise=None, how=None, ignore_na=False): + bias=False, freq=None, 
pairwise=None, how=None, ignore_na=False, adjust=True): if arg2 is None: arg2 = arg1 pairwise = True if pairwise is None else pairwise @@ -521,17 +526,17 @@ def ewmcov(arg1, arg2=None, com=None, span=None, halflife=None, min_periods=0, pairwise = True if pairwise is None else pairwise arg1 = _conv_timerule(arg1, freq, how) arg2 = _conv_timerule(arg2, freq, how) + com = _get_center_of_mass(com, span, halflife) def _get_ewmcov(X, Y): - mean = lambda x: ewma(x, com=com, span=span, halflife=halflife, min_periods=min_periods, - ignore_na=ignore_na) - return (mean(X * Y) - mean(X) * mean(Y)) + # X and Y have the same structure (and NaNs) when called from _flex_binary_moment() + return_hook, x_values = _process_data_structure(X) + return_hook, y_values = _process_data_structure(Y) + cov = algos.ewmcov(x_values, y_values, com, int(adjust), int(ignore_na), int(min_periods), int(bias)) + return return_hook(cov) + result = _flex_binary_moment(arg1, arg2, _get_ewmcov, pairwise=bool(pairwise)) - if not bias: - com = _get_center_of_mass(com, span, halflife) - result *= (1.0 + 2.0 * com) / (2.0 * com) - return result @@ -539,7 +544,7 @@ def _get_ewmcov(X, Y): _ewm_kw+_pairwise_kw, _type_of_input_retval, _ewm_notes) @Appender(_doc_template) def ewmcorr(arg1, arg2=None, com=None, span=None, halflife=None, min_periods=0, - freq=None, pairwise=None, how=None, ignore_na=False): + freq=None, pairwise=None, how=None, ignore_na=False, adjust=True): if arg2 is None: arg2 = arg1 pairwise = True if pairwise is None else pairwise @@ -549,13 +554,18 @@ def ewmcorr(arg1, arg2=None, com=None, span=None, halflife=None, min_periods=0, pairwise = True if pairwise is None else pairwise arg1 = _conv_timerule(arg1, freq, how) arg2 = _conv_timerule(arg2, freq, how) + com = _get_center_of_mass(com, span, halflife) def _get_ewmcorr(X, Y): - mean = lambda x: ewma(x, com=com, span=span, halflife=halflife, min_periods=min_periods, - ignore_na=ignore_na) - var = lambda x: ewmvar(x, com=com, span=span, halflife=halflife, min_periods=min_periods, - bias=True, ignore_na=ignore_na) - return (mean(X * Y) - mean(X) * mean(Y)) / _zsqrt(var(X) * var(Y)) + # X and Y have the same structure (and NaNs) when called from _flex_binary_moment() + return_hook, x_values = _process_data_structure(X) + return_hook, y_values = _process_data_structure(Y) + cov = algos.ewmcov(x_values, y_values, com, int(adjust), int(ignore_na), int(min_periods), 1) + x_var = algos.ewmcov(x_values, x_values, com, int(adjust), int(ignore_na), int(min_periods), 1) + y_var = algos.ewmcov(y_values, y_values, com, int(adjust), int(ignore_na), int(min_periods), 1) + corr = cov / _zsqrt(x_var * y_var) + return return_hook(corr) + result = _flex_binary_moment(arg1, arg2, _get_ewmcorr, pairwise=bool(pairwise)) return result @@ -615,14 +625,14 @@ def _use_window(minp, window): return minp -def _rolling_func(func, desc, check_minp=_use_window, how=None): +def _rolling_func(func, desc, check_minp=_use_window, how=None, additional_kw=''): if how is None: how_arg_str = 'None' else: how_arg_str = "'%s"%how - @Substitution(desc, _unary_arg, _roll_kw%how_arg_str, _type_of_input_retval, - _roll_notes) + @Substitution(desc, _unary_arg, _roll_kw%how_arg_str + additional_kw, + _type_of_input_retval, _roll_notes) @Appender(_doc_template) @wraps(func) def f(arg, window, min_periods=None, freq=None, center=False, how=how, @@ -643,10 +653,12 @@ def call_cython(arg, window, minp, args=(), kwargs={}, **kwds): how='median') _ts_std = lambda *a, **kw: _zsqrt(algos.roll_var(*a, **kw)) -rolling_std = 
_rolling_func(_ts_std, 'Unbiased moving standard deviation.', - check_minp=_require_min_periods(1)) -rolling_var = _rolling_func(algos.roll_var, 'Unbiased moving variance.', - check_minp=_require_min_periods(1)) +rolling_std = _rolling_func(_ts_std, 'Moving standard deviation.', + check_minp=_require_min_periods(1), + additional_kw=_ddof_kw) +rolling_var = _rolling_func(algos.roll_var, 'Moving variance.', + check_minp=_require_min_periods(1), + additional_kw=_ddof_kw) rolling_skew = _rolling_func(algos.roll_skew, 'Unbiased moving skewness.', check_minp=_require_min_periods(3)) rolling_kurt = _rolling_func(algos.roll_kurt, 'Unbiased moving kurtosis.', @@ -733,11 +745,12 @@ def rolling_apply(arg, window, func, min_periods=None, freq=None, frequency by resampling the data. This is done with the default parameters of :meth:`~pandas.Series.resample` (i.e. using the `mean`). """ + offset = int((window - 1) / 2.) if center else 0 def call_cython(arg, window, minp, args, kwargs): minp = _use_window(minp, window) - return algos.roll_generic(arg, window, minp, func, args, kwargs) + return algos.roll_generic(arg, window, minp, offset, func, args, kwargs) return _rolling_moment(arg, window, call_cython, min_periods, freq=freq, - center=center, args=args, kwargs=kwargs) + center=False, args=args, kwargs=kwargs) def rolling_window(arg, window=None, win_type=None, min_periods=None, @@ -821,13 +834,19 @@ def rolling_window(arg, window=None, win_type=None, min_periods=None, arg = _conv_timerule(arg, freq, how) return_hook, values = _process_data_structure(arg) - f = lambda x: algos.roll_window(x, window, minp, avg=mean) - result = np.apply_along_axis(f, axis, values) + if values.size == 0: + result = values.copy() + else: + offset = int((len(window) - 1) / 2.) if center else 0 + additional_nans = np.array([np.NaN] * offset) + f = lambda x: algos.roll_window(np.concatenate((x, additional_nans)) if center else x, + window, minp, avg=mean) + result = np.apply_along_axis(f, axis, values) - rs = return_hook(result) if center: - rs = _center_window(rs, len(window), axis) - return rs + result = _center_window(result, len(window), axis) + + return return_hook(result) def _validate_win_type(win_type, kwargs): @@ -852,18 +871,19 @@ def _pop_args(win_type, arg_names, kwargs): return all_args -def _expanding_func(func, desc, check_minp=_use_window): - @Substitution(desc, _unary_arg, _expanding_kw, _type_of_input_retval, "") +def _expanding_func(func, desc, check_minp=_use_window, additional_kw=''): + @Substitution(desc, _unary_arg, _expanding_kw + additional_kw, + _type_of_input_retval, "") @Appender(_doc_template) @wraps(func) - def f(arg, min_periods=1, freq=None, center=False, **kwargs): - window = len(arg) + def f(arg, min_periods=1, freq=None, **kwargs): + window = max(len(arg), min_periods) if min_periods else len(arg) def call_cython(arg, window, minp, args=(), kwargs={}, **kwds): minp = check_minp(minp, window) return func(arg, window, minp, **kwds) return _rolling_moment(arg, window, call_cython, min_periods, freq=freq, - center=center, **kwargs) + **kwargs) return f @@ -871,23 +891,21 @@ def call_cython(arg, window, minp, args=(), kwargs={}, **kwds): expanding_min = _expanding_func(algos.roll_min2, 'Expanding minimum.') expanding_sum = _expanding_func(algos.roll_sum, 'Expanding sum.') expanding_mean = _expanding_func(algos.roll_mean, 'Expanding mean.') -expanding_median = _expanding_func( - algos.roll_median_cython, 'Expanding median.') - -expanding_std = _expanding_func(_ts_std, - 'Unbiased expanding 
standard deviation.', - check_minp=_require_min_periods(2)) -expanding_var = _expanding_func(algos.roll_var, 'Unbiased expanding variance.', - check_minp=_require_min_periods(2)) -expanding_skew = _expanding_func( - algos.roll_skew, 'Unbiased expanding skewness.', - check_minp=_require_min_periods(3)) -expanding_kurt = _expanding_func( - algos.roll_kurt, 'Unbiased expanding kurtosis.', - check_minp=_require_min_periods(4)) - - -def expanding_count(arg, freq=None, center=False): +expanding_median = _expanding_func(algos.roll_median_cython, 'Expanding median.') + +expanding_std = _expanding_func(_ts_std, 'Expanding standard deviation.', + check_minp=_require_min_periods(1), + additional_kw=_ddof_kw) +expanding_var = _expanding_func(algos.roll_var, 'Expanding variance.', + check_minp=_require_min_periods(1), + additional_kw=_ddof_kw) +expanding_skew = _expanding_func(algos.roll_skew, 'Unbiased expanding skewness.', + check_minp=_require_min_periods(3)) +expanding_kurt = _expanding_func(algos.roll_kurt, 'Unbiased expanding kurtosis.', + check_minp=_require_min_periods(4)) + + +def expanding_count(arg, freq=None): """ Expanding count of number of non-NaN observations. @@ -897,8 +915,6 @@ def expanding_count(arg, freq=None, center=False): freq : string or DateOffset object, optional (default None) Frequency to conform the data to before computing the statistic. Specified as a frequency string or DateOffset object. - center : boolean, default False - Whether the label should correspond with center of window. Returns ------- @@ -910,11 +926,10 @@ def expanding_count(arg, freq=None, center=False): frequency by resampling the data. This is done with the default parameters of :meth:`~pandas.Series.resample` (i.e. using the `mean`). """ - return rolling_count(arg, len(arg), freq=freq, center=center) + return rolling_count(arg, len(arg), freq=freq) -def expanding_quantile(arg, quantile, min_periods=1, freq=None, - center=False): +def expanding_quantile(arg, quantile, min_periods=1, freq=None): """Expanding quantile. Parameters @@ -928,8 +943,6 @@ def expanding_quantile(arg, quantile, min_periods=1, freq=None, freq : string or DateOffset object, optional (default None) Frequency to conform the data to before computing the statistic. Specified as a frequency string or DateOffset object. - center : boolean, default False - Whether the label should correspond with center of window. Returns ------- @@ -942,14 +955,13 @@ def expanding_quantile(arg, quantile, min_periods=1, freq=None, of :meth:`~pandas.Series.resample` (i.e. using the `mean`). 
""" return rolling_quantile(arg, len(arg), quantile, min_periods=min_periods, - freq=freq, center=center) + freq=freq) @Substitution("Unbiased expanding covariance.", _binary_arg_flex, - _expanding_kw+_pairwise_kw, _flex_retval, "") + _expanding_kw+_pairwise_kw+_ddof_kw, _flex_retval, "") @Appender(_doc_template) -def expanding_cov(arg1, arg2=None, min_periods=1, freq=None, center=False, - pairwise=None): +def expanding_cov(arg1, arg2=None, min_periods=1, freq=None, pairwise=None, ddof=1): if arg2 is None: arg2 = arg1 pairwise = True if pairwise is None else pairwise @@ -957,17 +969,16 @@ def expanding_cov(arg1, arg2=None, min_periods=1, freq=None, center=False, min_periods = arg2 arg2 = arg1 pairwise = True if pairwise is None else pairwise - window = len(arg1) + len(arg2) + window = max((len(arg1) + len(arg2)), min_periods) if min_periods else (len(arg1) + len(arg2)) return rolling_cov(arg1, arg2, window, min_periods=min_periods, freq=freq, - center=center, pairwise=pairwise) + pairwise=pairwise, ddof=ddof) @Substitution("Expanding sample correlation.", _binary_arg_flex, _expanding_kw+_pairwise_kw, _flex_retval, "") @Appender(_doc_template) -def expanding_corr(arg1, arg2=None, min_periods=1, freq=None, center=False, - pairwise=None): +def expanding_corr(arg1, arg2=None, min_periods=1, freq=None, pairwise=None): if arg2 is None: arg2 = arg1 pairwise = True if pairwise is None else pairwise @@ -975,25 +986,24 @@ def expanding_corr(arg1, arg2=None, min_periods=1, freq=None, center=False, min_periods = arg2 arg2 = arg1 pairwise = True if pairwise is None else pairwise - window = len(arg1) + len(arg2) + window = max((len(arg1) + len(arg2)), min_periods) if min_periods else (len(arg1) + len(arg2)) return rolling_corr(arg1, arg2, window, min_periods=min_periods, - freq=freq, center=center, pairwise=pairwise) + freq=freq, pairwise=pairwise) @Substitution("Deprecated. Use expanding_corr(..., pairwise=True) instead.\n\n" "Pairwise expanding sample correlation", _pairwise_arg, _expanding_kw, _pairwise_retval, "") @Appender(_doc_template) -def expanding_corr_pairwise(df1, df2=None, min_periods=1, freq=None, - center=False): +def expanding_corr_pairwise(df1, df2=None, min_periods=1, freq=None): import warnings warnings.warn("expanding_corr_pairwise is deprecated, use expanding_corr(..., pairwise=True)", FutureWarning) return expanding_corr(df1, df2, min_periods=min_periods, - freq=freq, center=center, pairwise=True) + freq=freq, pairwise=True) -def expanding_apply(arg, func, min_periods=1, freq=None, center=False, +def expanding_apply(arg, func, min_periods=1, freq=None, args=(), kwargs={}): """Generic expanding function application. @@ -1008,8 +1018,6 @@ def expanding_apply(arg, func, min_periods=1, freq=None, center=False, freq : string or DateOffset object, optional (default None) Frequency to conform the data to before computing the statistic. Specified as a frequency string or DateOffset object. - center : boolean, default False - Whether the label should correspond with center of window. args : tuple Passed on to func kwargs : dict @@ -1025,6 +1033,6 @@ def expanding_apply(arg, func, min_periods=1, freq=None, center=False, frequency by resampling the data. This is done with the default parameters of :meth:`~pandas.Series.resample` (i.e. using the `mean`). 
""" - window = len(arg) + window = max(len(arg), min_periods) if min_periods else len(arg) return rolling_apply(arg, window, func, min_periods=min_periods, freq=freq, - center=center, args=args, kwargs=kwargs) + args=args, kwargs=kwargs) diff --git a/pandas/stats/tests/test_moments.py b/pandas/stats/tests/test_moments.py index ce7f9c8a225a8..a30286479c847 100644 --- a/pandas/stats/tests/test_moments.py +++ b/pandas/stats/tests/test_moments.py @@ -1,12 +1,15 @@ import nose import sys import functools +import warnings from datetime import datetime from numpy.random import randn +from numpy.testing.decorators import slow import numpy as np +from distutils.version import LooseVersion -from pandas import Series, DataFrame, Panel, bdate_range, isnull, notnull +from pandas import Series, DataFrame, Panel, bdate_range, isnull, notnull, concat from pandas.util.testing import ( assert_almost_equal, assert_series_equal, assert_frame_equal, assert_panel_equal, assert_index_equal ) @@ -17,15 +20,14 @@ N, K = 100, 10 - -class TestMoments(tm.TestCase): +class Base(tm.TestCase): _multiprocess_can_split_ = True _nan_locs = np.arange(20, 40) _inf_locs = np.array([]) - def setUp(self): + def _create_data(self): arr = randn(N) arr[self._nan_locs] = np.NaN @@ -37,6 +39,12 @@ def setUp(self): self.frame = DataFrame(randn(N, K), index=self.rng, columns=np.arange(K)) +class TestMoments(Base): + + def setUp(self): + self._create_data() + warnings.simplefilter("ignore", category=FutureWarning) + def test_centered_axis_validation(self): # ok mom.rolling_mean(Series(np.ones(10)),3,center=True ,axis=0) @@ -63,47 +71,40 @@ def test_rolling_mean(self): self._check_moment_func(mom.rolling_mean, np.mean) def test_cmov_mean(self): + # GH 8238 tm._skip_if_no_scipy() - try: - from scikits.timeseries.lib import cmov_mean - except ImportError: - raise nose.SkipTest("no scikits.timeseries") - vals = np.random.randn(10) - xp = cmov_mean(vals, 5) + vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, + 16.68, 9.48, 10.63, 14.48]) + xp = np.array([np.nan, np.nan, 9.962, 11.27 , 11.564, 12.516, + 12.818, 12.952, np.nan, np.nan]) rs = mom.rolling_mean(vals, 5, center=True) - assert_almost_equal(xp.compressed(), rs[2:-2]) - assert_almost_equal(xp.mask, np.isnan(rs)) + assert_almost_equal(xp, rs) xp = Series(rs) rs = mom.rolling_mean(Series(vals), 5, center=True) assert_series_equal(xp, rs) def test_cmov_window(self): + # GH 8238 tm._skip_if_no_scipy() - try: - from scikits.timeseries.lib import cmov_window - except ImportError: - raise nose.SkipTest("no scikits.timeseries") - vals = np.random.randn(10) - xp = cmov_window(vals, 5, 'boxcar') + vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, + 13.49, 16.68, 9.48, 10.63, 14.48]) + xp = np.array([np.nan, np.nan, 9.962, 11.27 , 11.564, 12.516, + 12.818, 12.952, np.nan, np.nan]) rs = mom.rolling_window(vals, 5, 'boxcar', center=True) - assert_almost_equal(xp.compressed(), rs[2:-2]) - assert_almost_equal(xp.mask, np.isnan(rs)) + assert_almost_equal(xp, rs) xp = Series(rs) rs = mom.rolling_window(Series(vals), 5, 'boxcar', center=True) assert_series_equal(xp, rs) def test_cmov_window_corner(self): + # GH 8238 tm._skip_if_no_scipy() - try: - from scikits.timeseries.lib import cmov_window - except ImportError: - raise nose.SkipTest("no scikits.timeseries") # all nan vals = np.empty(10, dtype=float) @@ -123,24 +124,37 @@ def test_cmov_window_corner(self): self.assertEqual(len(rs), 5) def test_cmov_window_frame(self): + # Gh 8238 tm._skip_if_no_scipy() - try: - from scikits.timeseries.lib 
import cmov_window - except ImportError: - raise nose.SkipTest("no scikits.timeseries") + + vals = np.array([[ 12.18, 3.64], + [ 10.18, 9.16], + [ 13.24, 14.61], + [ 4.51, 8.11], + [ 6.15, 11.44], + [ 9.14, 6.21], + [ 11.31, 10.67], + [ 2.94, 6.51], + [ 9.42, 8.39], + [ 12.44, 7.34 ]]) + + xp = np.array([[ np.nan, np.nan], + [ np.nan, np.nan], + [ 9.252, 9.392], + [ 8.644, 9.906], + [ 8.87 , 10.208], + [ 6.81 , 8.588], + [ 7.792, 8.644], + [ 9.05 , 7.824], + [ np.nan, np.nan], + [ np.nan, np.nan]]) # DataFrame - vals = np.random.randn(10, 2) - xp = cmov_window(vals, 5, 'boxcar') rs = mom.rolling_window(DataFrame(vals), 5, 'boxcar', center=True) assert_frame_equal(DataFrame(xp), rs) def test_cmov_window_na_min_periods(self): tm._skip_if_no_scipy() - try: - from scikits.timeseries.lib import cmov_window - except ImportError: - raise nose.SkipTest("no scikits.timeseries") # min_periods vals = Series(np.random.randn(10)) @@ -153,39 +167,136 @@ def test_cmov_window_na_min_periods(self): assert_series_equal(xp, rs) def test_cmov_window_regular(self): + # GH 8238 tm._skip_if_no_scipy() - try: - from scikits.timeseries.lib import cmov_window - except ImportError: - raise nose.SkipTest("no scikits.timeseries") win_types = ['triang', 'blackman', 'hamming', 'bartlett', 'bohman', 'blackmanharris', 'nuttall', 'barthann'] + + vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, + 13.49, 16.68, 9.48, 10.63, 14.48]) + xps = { + 'hamming': [np.nan, np.nan, 8.71384, 9.56348, 12.38009, + 14.03687, 13.8567, 11.81473, np.nan, np.nan], + 'triang': [np.nan, np.nan, 9.28667, 10.34667, 12.00556, + 13.33889, 13.38, 12.33667, np.nan, np.nan], + 'barthann': [np.nan, np.nan, 8.4425, 9.1925, 12.5575, + 14.3675, 14.0825, 11.5675, np.nan, np.nan], + 'bohman': [np.nan, np.nan, 7.61599, 9.1764, 12.83559, + 14.17267, 14.65923, 11.10401, np.nan, np.nan], + 'blackmanharris': [np.nan, np.nan, 6.97691, 9.16438, 13.05052, + 14.02156, 15.10512, 10.74574, np.nan, np.nan], + 'nuttall': [np.nan, np.nan, 7.04618, 9.16786, 13.02671, + 14.03559, 15.05657, 10.78514, np.nan, np.nan], + 'blackman': [np.nan, np.nan, 7.73345, 9.17869, 12.79607, + 14.20036, 14.57726, 11.16988, np.nan, np.nan], + 'bartlett': [np.nan, np.nan, 8.4425, 9.1925, 12.5575, + 14.3675, 14.0825, 11.5675, np.nan, np.nan]} + for wt in win_types: - vals = np.random.randn(10) - xp = cmov_window(vals, 5, wt) + xp = Series(xps[wt]) + rs = mom.rolling_window(Series(vals), 5, wt, center=True) + assert_series_equal(xp, rs) + + def test_cmov_window_regular_linear_range(self): + # GH 8238 + tm._skip_if_no_scipy() + win_types = ['triang', 'blackman', 'hamming', 'bartlett', 'bohman', + 'blackmanharris', 'nuttall', 'barthann'] + + vals = np.array(range(10), dtype=np.float) + xp = vals.copy() + xp[:2] = np.nan + xp[-2:] = np.nan + xp = Series(xp) + + for wt in win_types: rs = mom.rolling_window(Series(vals), 5, wt, center=True) - assert_series_equal(Series(xp), rs) + assert_series_equal(xp, rs) + + def test_cmov_window_regular_missing_data(self): + # GH 8238 + tm._skip_if_no_scipy() + + win_types = ['triang', 'blackman', 'hamming', 'bartlett', 'bohman', + 'blackmanharris', 'nuttall', 'barthann'] + + vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, + 13.49, 16.68, np.nan, 10.63, 14.48]) + xps = { + 'bartlett': [np.nan, np.nan, 9.70333, 10.5225, 8.4425, + 9.1925, 12.5575, 14.3675, 15.61667, 13.655], + 'blackman': [np.nan, np.nan, 9.04582, 11.41536, 7.73345, + 9.17869, 12.79607, 14.20036, 15.8706, 13.655], + 'barthann': [np.nan, np.nan, 9.70333, 10.5225, 8.4425, + 9.1925, 12.5575, 14.3675, 
15.61667, 13.655], + 'bohman': [np.nan, np.nan, 8.9444, 11.56327, 7.61599, + 9.1764, 12.83559, 14.17267, 15.90976, 13.655], + 'hamming': [np.nan, np.nan, 9.59321, 10.29694, 8.71384, + 9.56348, 12.38009, 14.20565, 15.24694, 13.69758], + 'nuttall': [np.nan, np.nan, 8.47693, 12.2821, 7.04618, + 9.16786, 13.02671, 14.03673, 16.08759, 13.65553], + 'triang': [np.nan, np.nan, 9.33167, 9.76125, 9.28667, + 10.34667, 12.00556, 13.82125, 14.49429, 13.765], + 'blackmanharris': [np.nan, np.nan, 8.42526, 12.36824, 6.97691, + 9.16438, 13.05052, 14.02175, 16.1098, + 13.65509] + } + + for wt in win_types: + xp = Series(xps[wt]) + rs = mom.rolling_window(Series(vals), 5, wt, min_periods=3) + assert_series_equal(xp, rs) def test_cmov_window_special(self): + # GH 8238 tm._skip_if_no_scipy() - try: - from scikits.timeseries.lib import cmov_window - except ImportError: - raise nose.SkipTest("no scikits.timeseries") win_types = ['kaiser', 'gaussian', 'general_gaussian', 'slepian'] kwds = [{'beta': 1.}, {'std': 1.}, {'power': 2., 'width': 2.}, {'width': 0.5}] + vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, + 13.49, 16.68, 9.48, 10.63, 14.48]) + + xps = { + 'gaussian': [np.nan, np.nan, 8.97297, 9.76077, 12.24763, + 13.89053, 13.65671, 12.01002, np.nan, np.nan], + 'general_gaussian': [np.nan, np.nan, 9.85011, 10.71589, + 11.73161, 13.08516, 12.95111, 12.74577, + np.nan, np.nan], + 'slepian': [np.nan, np.nan, 9.81073, 10.89359, 11.70284, + 12.88331, 12.96079, 12.77008, np.nan, np.nan], + 'kaiser': [np.nan, np.nan, 9.86851, 11.02969, 11.65161, + 12.75129, 12.90702, 12.83757, np.nan, np.nan] + } + for wt, k in zip(win_types, kwds): - vals = np.random.randn(10) - xp = cmov_window(vals, 5, (wt,) + tuple(k.values())) + xp = Series(xps[wt]) rs = mom.rolling_window(Series(vals), 5, wt, center=True, **k) - assert_series_equal(Series(xp), rs) + assert_series_equal(xp, rs) + + def test_cmov_window_special_linear_range(self): + # GH 8238 + tm._skip_if_no_scipy() + + win_types = ['kaiser', 'gaussian', 'general_gaussian', 'slepian'] + kwds = [{'beta': 1.}, {'std': 1.}, {'power': 2., 'width': 2.}, + {'width': 0.5}] + + vals = np.array(range(10), dtype=np.float) + xp = vals.copy() + xp[:2] = np.nan + xp[-2:] = np.nan + xp = Series(xp) + + for wt, k in zip(win_types, kwds): + rs = mom.rolling_window(Series(vals), 5, wt, center=True, + **k) + assert_series_equal(xp, rs) def test_rolling_median(self): self._check_moment_func(mom.rolling_median, np.median) @@ -232,17 +343,26 @@ def alt(x): self._check_moment_func(f, alt) def test_rolling_apply(self): - ser = Series([]) - assert_series_equal( - ser, mom.rolling_apply(ser, 10, lambda x: x.mean())) - - def roll_mean(x, window, min_periods=None, freq=None, center=False): - return mom.rolling_apply(x, window, - lambda x: x[np.isfinite(x)].mean(), - min_periods=min_periods, - freq=freq, - center=center) - self._check_moment_func(roll_mean, np.mean) + # suppress warnings about empty slices, as we are deliberately testing with a 0-length Series + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message=".*(empty slice|0 for slice).*", category=RuntimeWarning) + + ser = Series([]) + assert_series_equal(ser, mom.rolling_apply(ser, 10, lambda x: x.mean())) + + def roll_mean(x, window, min_periods=None, freq=None, center=False): + return mom.rolling_apply(x, window, + lambda x: x[np.isfinite(x)].mean(), + min_periods=min_periods, + freq=freq, + center=center) + self._check_moment_func(roll_mean, np.mean) + + # GH 8080 + s = Series([None, None, None]) + result = 
mom.rolling_apply(s, 2, lambda x: len(x), min_periods=0) + expected = Series([1., 2., 2.]) + assert_series_equal(result, expected) def test_rolling_apply_out_of_bounds(self): # #1850 @@ -264,8 +384,12 @@ def test_rolling_std(self): def test_rolling_std_1obs(self): result = mom.rolling_std(np.array([1., 2., 3., 4., 5.]), 1, min_periods=1) - expected = np.zeros(5) + expected = np.array([np.nan] * 5) + assert_almost_equal(result, expected) + result = mom.rolling_std(np.array([1., 2., 3., 4., 5.]), + 1, min_periods=1, ddof=0) + expected = np.zeros(5) assert_almost_equal(result, expected) result = mom.rolling_std(np.array([np.nan, np.nan, 3., 4., 5.]), @@ -406,24 +530,16 @@ def _check_ndarray(self, func, static_comp, window=50, result = func(arr, 50) assert_almost_equal(result[-1], static_comp(arr[10:-10])) + # GH 7925 if has_center: if has_min_periods: result = func(arr, 20, min_periods=15, center=True) - expected = func(arr, 20, min_periods=15) + expected = func(np.concatenate((arr, np.array([np.NaN] * 9))), 20, min_periods=15)[9:] else: result = func(arr, 20, center=True) - expected = func(arr, 20) + expected = func(np.concatenate((arr, np.array([np.NaN] * 9))), 20)[9:] - assert_almost_equal(result[1], expected[10]) - if fill_value is None: - self.assertTrue(np.isnan(result[-9:]).all()) - else: - self.assertTrue((result[-9:] == 0).all()) - if has_min_periods: - self.assertTrue(np.isnan(expected[23])) - self.assertTrue(np.isnan(result[14])) - self.assertTrue(np.isnan(expected[-5])) - self.assertTrue(np.isnan(result[-14])) + self.assert_numpy_array_equivalent(result, expected) if test_stable: result = func(self.arr + 1e9, window) @@ -488,11 +604,12 @@ def _check_structures(self, func, static_comp, assert_almost_equal(frame_result.xs(last_date), trunc_frame.apply(static_comp)) + # GH 7925 if has_center: if has_min_periods: minp = 10 - series_xp = func(self.series, 25, min_periods=minp).shift(-12) - frame_xp = func(self.frame, 25, min_periods=minp).shift(-12) + series_xp = func(self.series.reindex(list(self.series.index)+['x%d'%x for x in range(12)]), 25, min_periods=minp).shift(-12).reindex(self.series.index) + frame_xp = func(self.frame.reindex(list(self.frame.index)+['x%d'%x for x in range(12)]), 25, min_periods=minp).shift(-12).reindex(self.frame.index) series_rs = func(self.series, 25, min_periods=minp, center=True) @@ -500,8 +617,8 @@ def _check_structures(self, func, static_comp, center=True) else: - series_xp = func(self.series, 25).shift(-12) - frame_xp = func(self.frame, 25).shift(-12) + series_xp = func(self.series.reindex(list(self.series.index)+['x%d'%x for x in range(12)]), 25).shift(-12).reindex(self.series.index) + frame_xp = func(self.frame.reindex(list(self.frame.index)+['x%d'%x for x in range(12)]), 25).shift(-12).reindex(self.frame.index) series_rs = func(self.series, 25, center=True) frame_rs = func(self.frame, 25, center=True) @@ -521,7 +638,7 @@ def test_ewma(self): self.assertTrue(np.abs(result - 1) < 1e-2) s = Series([1.0, 2.0, 4.0, 8.0]) - + expected = Series([1.0, 1.6, 2.736842, 4.923077]) for f in [lambda s: mom.ewma(s, com=2.0, adjust=True), lambda s: mom.ewma(s, com=2.0, adjust=True, ignore_na=False), @@ -643,10 +760,9 @@ def _check_ew_ndarray(self, func, preserve_nan=False): self.assertTrue(np.isnan(result.values[:10]).all()) self.assertFalse(np.isnan(result.values[10:]).any()) else: - # ewmstd, ewmvol, ewmvar *should* require at least two values, - # but currently require only one, for some reason - self.assertTrue(np.isnan(result.values[:10]).all()) - 
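Put concretely (an illustrative snippet, not part of this patch): with the debiasing now performed inside algos.ewmcov, the unbiased EWM estimators need at least two observations, so a length-1 input yields NaN instead of the old 0.0, while the biased estimate is still defined:

    import numpy as np
    from pandas import Series
    import pandas.stats.moments as mom

    s = Series([1.0])
    print(mom.ewmvar(s, com=2.0, bias=False))  # expected: Series([nan])
    print(mom.ewmvar(s, com=2.0, bias=True))   # expected: Series([0.0])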
self.assertFalse(np.isnan(result.values[10:]).any()) + # ewmstd, ewmvol, ewmvar (with bias=False) require at least two values + self.assertTrue(np.isnan(result.values[:11]).all()) + self.assertFalse(np.isnan(result.values[11:]).any()) # check series of length 0 result = func(Series([]), 50, min_periods=min_periods) @@ -657,9 +773,8 @@ def _check_ew_ndarray(self, func, preserve_nan=False): if func == mom.ewma: assert_series_equal(result, Series([1.])) else: - # ewmstd, ewmvol, ewmvar *should* require at least two values, - # so should return NaN, but currently require one, so return 0. - assert_series_equal(result, Series([0.])) + # ewmstd, ewmvol, ewmvar with bias=False require at least two values + assert_series_equal(result, Series([np.NaN])) # pass in ints result2 = func(np.arange(50), span=10) @@ -671,6 +786,439 @@ def _check_ew_structures(self, func): frame_result = func(self.frame, com=10) self.assertEqual(type(frame_result), DataFrame) +# create the data only once as we are not setting it +def _create_consistency_data(): + + def create_series(): + return [Series(), + Series([np.nan]), + Series([np.nan, np.nan]), + Series([3.]), + Series([np.nan, 3.]), + Series([3., np.nan]), + Series([1., 3.]), + Series([2., 2.]), + Series([3., 1.]), + Series([5., 5., 5., 5., np.nan, np.nan, np.nan, 5., 5., np.nan, np.nan]), + Series([np.nan, 5., 5., 5., np.nan, np.nan, np.nan, 5., 5., np.nan, np.nan]), + Series([np.nan, np.nan, 5., 5., np.nan, np.nan, np.nan, 5., 5., np.nan, np.nan]), + Series([np.nan, 3., np.nan, 3., 4., 5., 6., np.nan, np.nan, 7., 12., 13., 14., 15.]), + Series([np.nan, 5., np.nan, 2., 4., 0., 9., np.nan, np.nan, 3., 12., 13., 14., 15.]), + Series([2., 3., np.nan, 3., 4., 5., 6., np.nan, np.nan, 7., 12., 13., 14., 15.]), + Series([2., 5., np.nan, 2., 4., 0., 9., np.nan, np.nan, 3., 12., 13., 14., 15.]), + Series(range(10)), + Series(range(20, 0, -2)), + ] + + def create_dataframes(): + return [DataFrame(), + DataFrame(columns=['a']), + DataFrame(columns=['a', 'a']), + DataFrame(columns=['a', 'b']), + DataFrame(np.arange(10).reshape((5, 2))), + DataFrame(np.arange(25).reshape((5, 5))), + DataFrame(np.arange(25).reshape((5, 5)), columns=['a', 'b', 99, 'd', 'd']), + ] + [DataFrame(s) for s in create_series()] + + def is_constant(x): + values = x.values.ravel() + return len(set(values[notnull(values)])) == 1 + + def no_nans(x): + return x.notnull().all().all() + + # data is a tuple(object, is_contant, no_nans) + data = create_series() + create_dataframes() + + return [ (x, is_constant(x), no_nans(x)) for x in data ] +_consistency_data = _create_consistency_data() + +class TestMomentsConsistency(Base): + + def _create_data(self): + super(TestMomentsConsistency, self)._create_data() + self.data = _consistency_data + + def setUp(self): + self._create_data() + warnings.simplefilter("ignore", category=FutureWarning) + + def _test_moments_consistency(self, + min_periods, + count, mean, mock_mean, corr, + var_unbiased=None, std_unbiased=None, cov_unbiased=None, + var_biased=None, std_biased=None, cov_biased=None, + var_debiasing_factors=None): + + def _non_null_values(x): + values = x.values.ravel() + return set(values[notnull(values)].tolist()) + + for (x, is_constant, no_nans) in self.data: + assert_equal = assert_series_equal if isinstance(x, Series) else assert_frame_equal + count_x = count(x) + mean_x = mean(x) + + if mock_mean: + # check that mean equals mock_mean + expected = mock_mean(x) + assert_equal(mean_x, expected) + + # check that correlation of a series with itself is either 
1 or NaN + corr_x_x = corr(x, x) + # self.assertTrue(_non_null_values(corr_x_x).issubset(set([1.]))) # restore once rolling_cov(x, x) is identically equal to var(x) + + if is_constant: + # check mean of constant series + expected = x * np.nan + expected[count_x >= max(min_periods, 1)] = x.max().max() + assert_equal(mean_x, expected) + + # check correlation of constant series with itself is NaN + expected[:] = np.nan + assert_equal(corr_x_x, expected) + + if var_unbiased and var_biased and var_debiasing_factors: + # check variance debiasing factors + var_unbiased_x = var_unbiased(x) + var_biased_x = var_biased(x) + var_debiasing_factors_x = var_debiasing_factors(x) + assert_equal(var_unbiased_x, var_biased_x * var_debiasing_factors_x) + + for (std, var, cov) in [(std_biased, var_biased, cov_biased), + (std_unbiased, var_unbiased, cov_unbiased)]: + + # check that var(x), std(x), and cov(x) are all >= 0 + var_x = var(x) + std_x = std(x) + self.assertFalse((var_x < 0).any().any()) + self.assertFalse((std_x < 0).any().any()) + if cov: + cov_x_x = cov(x, x) + self.assertFalse((cov_x_x < 0).any().any()) + + # check that var(x) == cov(x, x) + assert_equal(var_x, cov_x_x) + + # check that var(x) == std(x)^2 + assert_equal(var_x, std_x * std_x) + + if var is var_biased: + # check that biased var(x) == mean(x^2) - mean(x)^2 + mean_x2 = mean(x * x) + assert_equal(var_x, mean_x2 - (mean_x * mean_x)) + + if is_constant: + # check that variance of constant series is identically 0 + self.assertFalse((var_x > 0).any().any()) + expected = x * np.nan + expected[count_x >= max(min_periods, 1)] = 0. + if var is var_unbiased: + expected[count_x < 2] = np.nan + assert_equal(var_x, expected) + + if isinstance(x, Series): + for (y, is_constant, no_nans) in self.data: + if not x.isnull().equals(y.isnull()): + # can only easily test two Series with similar structure + continue + + # check that cor(x, y) is symmetric + corr_x_y = corr(x, y) + corr_y_x = corr(y, x) + assert_equal(corr_x_y, corr_y_x) + + if cov: + # check that cov(x, y) is symmetric + cov_x_y = cov(x, y) + cov_y_x = cov(y, x) + assert_equal(cov_x_y, cov_y_x) + + # check that cov(x, y) == (var(x+y) - var(x) - var(y)) / 2 + var_x_plus_y = var(x + y) + var_y = var(y) + assert_equal(cov_x_y, 0.5 * (var_x_plus_y - var_x - var_y)) + + # check that corr(x, y) == cov(x, y) / (std(x) * std(y)) + std_y = std(y) + assert_equal(corr_x_y, cov_x_y / (std_x * std_y)) + + if cov is cov_biased: + # check that biased cov(x, y) == mean(x*y) - mean(x)*mean(y) + mean_y = mean(y) + mean_x_times_y = mean(x * y) + assert_equal(cov_x_y, mean_x_times_y - (mean_x * mean_y)) + + @slow + def test_ewm_consistency(self): + + def _weights(s, com, adjust, ignore_na): + if isinstance(s, DataFrame): + if not len(s.columns): + return DataFrame(index=s.index, columns=s.columns) + w = concat([ _weights(s.iloc[:, i], + com=com, + adjust=adjust, + ignore_na=ignore_na) for i, _ in enumerate(s.columns) ], + axis=1) + w.index=s.index + w.columns=s.columns + return w + + w = Series(np.nan, index=s.index) + alpha = 1. / (1. + com) + if ignore_na: + w[s.notnull()] = _weights(s[s.notnull()], com=com, adjust=adjust, ignore_na=False) + elif adjust: + for i in range(len(s)): + if s.iat[i] == s.iat[i]: + w.iat[i] = pow(1. / (1. - alpha), i) + else: + sum_wts = 0. + prev_i = -1 + for i in range(len(s)): + if s.iat[i] == s.iat[i]: + if prev_i == -1: + w.iat[i] = 1. + else: + w.iat[i] = alpha * sum_wts / pow(1. 
- alpha, i - prev_i) + sum_wts += w.iat[i] + prev_i = i + return w + + def _variance_debiasing_factors(s, com, adjust, ignore_na): + weights = _weights(s, com=com, adjust=adjust, ignore_na=ignore_na) + cum_sum = weights.cumsum().fillna(method='ffill') + cum_sum_sq = (weights * weights).cumsum().fillna(method='ffill') + numerator = cum_sum * cum_sum + denominator = numerator - cum_sum_sq + denominator[denominator <= 0.] = np.nan + return numerator / denominator + + def _ewma(s, com, min_periods, adjust, ignore_na): + weights = _weights(s, com=com, adjust=adjust, ignore_na=ignore_na) + result = s.multiply(weights).cumsum().divide(weights.cumsum()).fillna(method='ffill') + result[mom.expanding_count(s) < (max(min_periods, 1) if min_periods else 1)] = np.nan + return result + + com = 3. + for min_periods in [0, 1, 2, 3, 4]: + for adjust in [True, False]: + for ignore_na in [False, True]: + # test consistency between different ewm* moments + self._test_moments_consistency( + min_periods=min_periods, + count=mom.expanding_count, + mean=lambda x: mom.ewma(x, com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na), + mock_mean=lambda x: _ewma(x, com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na), + corr=lambda x, y: mom.ewmcorr(x, y, com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na), + var_unbiased=lambda x: mom.ewmvar(x, com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na, bias=False), + std_unbiased=lambda x: mom.ewmstd(x, com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na, bias=False), + cov_unbiased=lambda x, y: mom.ewmcov(x, y, com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na, bias=False), + var_biased=lambda x: mom.ewmvar(x, com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na, bias=True), + std_biased=lambda x: mom.ewmstd(x, com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na, bias=True), + cov_biased=lambda x, y: mom.ewmcov(x, y, com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na, bias=True), + var_debiasing_factors=lambda x: _variance_debiasing_factors(x, com=com, adjust=adjust, ignore_na=ignore_na)) + + @slow + def test_expanding_consistency(self): + base_functions = [ + (mom.expanding_count, lambda v: Series(v).count(), None), + (mom.expanding_max, lambda v: Series(v).max(), None), + (mom.expanding_min, lambda v: Series(v).min(), None), + (mom.expanding_sum, lambda v: Series(v).sum(), None), + (mom.expanding_mean, lambda v: Series(v).mean(), None), + (mom.expanding_std, lambda v: Series(v).std(), 1), + (mom.expanding_cov, lambda v: Series(v).cov(Series(v)), None), + (mom.expanding_corr, lambda v: Series(v).corr(Series(v)), None), + (mom.expanding_var, lambda v: Series(v).var(), 1), + #(mom.expanding_skew, lambda v: Series(v).skew(), 3), # restore once GH 8086 is fixed + #(mom.expanding_kurt, lambda v: Series(v).kurt(), 4), # restore once GH 8086 is fixed + #(lambda x, min_periods: mom.expanding_quantile(x, 0.3, min_periods=min_periods), + # lambda v: Series(v).quantile(0.3), None), # restore once GH 8084 is fixed + (mom.expanding_median, lambda v: Series(v).median(), None), + (mom.expanding_max, np.nanmax, 1), + (mom.expanding_min, np.nanmin, 1), + (mom.expanding_sum, np.nansum, 1), + ] + if np.__version__ >= LooseVersion('1.8.0'): + base_functions += [ + (mom.expanding_mean, np.nanmean, 1), + (mom.expanding_std, lambda v: np.nanstd(v, ddof=1), 1), + (mom.expanding_var, lambda v: np.nanvar(v, ddof=1), 1), + ] + if np.__version__ 
>= LooseVersion('1.9.0'): + base_functions += [ + (mom.expanding_median, np.nanmedian, 1), + ] + no_nan_functions = [ + (mom.expanding_max, np.max, None), + (mom.expanding_min, np.min, None), + (mom.expanding_sum, np.sum, None), + (mom.expanding_mean, np.mean, None), + (mom.expanding_std, lambda v: np.std(v, ddof=1), 1), + (mom.expanding_var, lambda v: np.var(v, ddof=1), 1), + (mom.expanding_median, np.median, None), + ] + + # suppress warnings about empty slices, as we are deliberately testing with empty/0-length Series/DataFrames + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message=".*(empty slice|0 for slice).*", category=RuntimeWarning) + + for min_periods in [0, 1, 2, 3, 4]: + + # test consistency between different expanding_* moments + self._test_moments_consistency( + min_periods=min_periods, + count=mom.expanding_count, + mean=lambda x: mom.expanding_mean(x, min_periods=min_periods), + mock_mean=lambda x: mom.expanding_sum(x, min_periods=min_periods) / mom.expanding_count(x), + corr=lambda x, y: mom.expanding_corr(x, y, min_periods=min_periods), + var_unbiased=lambda x: mom.expanding_var(x, min_periods=min_periods), + std_unbiased=lambda x: mom.expanding_std(x, min_periods=min_periods), + cov_unbiased=lambda x, y: mom.expanding_cov(x, y, min_periods=min_periods), + var_biased=lambda x: mom.expanding_var(x, min_periods=min_periods, ddof=0), + std_biased=lambda x: mom.expanding_std(x, min_periods=min_periods, ddof=0), + cov_biased=lambda x, y: mom.expanding_cov(x, y, min_periods=min_periods, ddof=0), + var_debiasing_factors=lambda x: mom.expanding_count(x) / (mom.expanding_count(x) - 1.).replace(0., np.nan) + ) + + # test consistency between expanding_xyz() and either (a) expanding_apply of Series.xyz(), + # or (b) expanding_apply of np.nanxyz() + for (x, is_constant, no_nans) in self.data: + assert_equal = assert_series_equal if isinstance(x, Series) else assert_frame_equal + functions = base_functions + + # GH 8269 + if no_nans: + functions = base_functions + no_nan_functions + for (expanding_f, f, require_min_periods) in functions: + if require_min_periods and (min_periods is not None) and (min_periods < require_min_periods): + continue + + if expanding_f is mom.expanding_count: + expanding_f_result = expanding_f(x) + expanding_apply_f_result = mom.expanding_apply(x, func=f, min_periods=0) + else: + if expanding_f in [mom.expanding_cov, mom.expanding_corr]: + expanding_f_result = expanding_f(x, min_periods=min_periods, pairwise=False) + else: + expanding_f_result = expanding_f(x, min_periods=min_periods) + expanding_apply_f_result = mom.expanding_apply(x, func=f, min_periods=min_periods) + assert_equal(expanding_f_result, expanding_apply_f_result) + + if (expanding_f in [mom.expanding_cov, mom.expanding_corr]) and isinstance(x, DataFrame): + # test pairwise=True + expanding_f_result = expanding_f(x, x, min_periods=min_periods, pairwise=True) + expected = Panel(items=x.index, major_axis=x.columns, minor_axis=x.columns) + for i, _ in enumerate(x.columns): + for j, _ in enumerate(x.columns): + expected.iloc[:, i, j] = expanding_f(x.iloc[:, i], x.iloc[:, j], min_periods=min_periods) + assert_panel_equal(expanding_f_result, expected) + + @slow + def test_rolling_consistency(self): + + base_functions = [ + (mom.rolling_count, lambda v: Series(v).count(), None), + (mom.rolling_max, lambda v: Series(v).max(), None), + (mom.rolling_min, lambda v: Series(v).min(), None), + (mom.rolling_sum, lambda v: Series(v).sum(), None), + (mom.rolling_mean, lambda v: 
Series(v).mean(), None), + (mom.rolling_std, lambda v: Series(v).std(), 1), + (mom.rolling_cov, lambda v: Series(v).cov(Series(v)), None), + (mom.rolling_corr, lambda v: Series(v).corr(Series(v)), None), + (mom.rolling_var, lambda v: Series(v).var(), 1), + #(mom.rolling_skew, lambda v: Series(v).skew(), 3), # restore once GH 8086 is fixed + #(mom.rolling_kurt, lambda v: Series(v).kurt(), 4), # restore once GH 8086 is fixed + #(lambda x, window, min_periods, center: mom.rolling_quantile(x, window, 0.3, min_periods=min_periods, center=center), + # lambda v: Series(v).quantile(0.3), None), # restore once GH 8084 is fixed + (mom.rolling_median, lambda v: Series(v).median(), None), + (mom.rolling_max, np.nanmax, 1), + (mom.rolling_min, np.nanmin, 1), + (mom.rolling_sum, np.nansum, 1), + ] + if np.__version__ >= LooseVersion('1.8.0'): + base_functions += [ + (mom.rolling_mean, np.nanmean, 1), + (mom.rolling_std, lambda v: np.nanstd(v, ddof=1), 1), + (mom.rolling_var, lambda v: np.nanvar(v, ddof=1), 1), + ] + if np.__version__ >= LooseVersion('1.9.0'): + base_functions += [ + (mom.rolling_median, np.nanmedian, 1), + ] + no_nan_functions = [ + (mom.rolling_max, np.max, None), + (mom.rolling_min, np.min, None), + (mom.rolling_sum, np.sum, None), + (mom.rolling_mean, np.mean, None), + (mom.rolling_std, lambda v: np.std(v, ddof=1), 1), + (mom.rolling_var, lambda v: np.var(v, ddof=1), 1), + (mom.rolling_median, np.median, None), + ] + + for window in [1, 2, 3, 10, 20]: + for min_periods in set([0, 1, 2, 3, 4, window]): + if min_periods and (min_periods > window): + continue + for center in [False, True]: + + # test consistency between different rolling_* moments + self._test_moments_consistency( + min_periods=min_periods, + count=lambda x: mom.rolling_count(x, window=window, center=center), + mean=lambda x: mom.rolling_mean(x, window=window, min_periods=min_periods, center=center), + mock_mean=lambda x: mom.rolling_sum(x, window=window, min_periods=min_periods, center=center).divide( + mom.rolling_count(x, window=window, center=center)), + corr=lambda x, y: mom.rolling_corr(x, y, window=window, min_periods=min_periods, center=center), + var_unbiased=lambda x: mom.rolling_var(x, window=window, min_periods=min_periods, center=center), + std_unbiased=lambda x: mom.rolling_std(x, window=window, min_periods=min_periods, center=center), + cov_unbiased=lambda x, y: mom.rolling_cov(x, y, window=window, min_periods=min_periods, center=center), + var_biased=lambda x: mom.rolling_var(x, window=window, min_periods=min_periods, center=center, ddof=0), + std_biased=lambda x: mom.rolling_std(x, window=window, min_periods=min_periods, center=center, ddof=0), + cov_biased=lambda x, y: mom.rolling_cov(x, y, window=window, min_periods=min_periods, center=center, ddof=0), + var_debiasing_factors=lambda x: mom.rolling_count(x, window=window, center=center).divide( + (mom.rolling_count(x, window=window, center=center) - 1.).replace(0., np.nan)), + ) + + # test consistency between rolling_xyz() and either (a) rolling_apply of Series.xyz(), + # or (b) rolling_apply of np.nanxyz() + for (x, is_constant, no_nans) in self.data: + + assert_equal = assert_series_equal if isinstance(x, Series) else assert_frame_equal + functions = base_functions + # GH 8269 + if no_nans: + functions = base_functions + no_nan_functions + for (rolling_f, f, require_min_periods) in functions: + if require_min_periods and (min_periods is not None) and (min_periods < require_min_periods): + continue + + if rolling_f is mom.rolling_count: + 
rolling_f_result = rolling_f(x, window=window, center=center) + rolling_apply_f_result = mom.rolling_apply(x, window=window, func=f, + min_periods=0, center=center) + else: + if rolling_f in [mom.rolling_cov, mom.rolling_corr]: + rolling_f_result = rolling_f(x, window=window, min_periods=min_periods, center=center, pairwise=False) + else: + rolling_f_result = rolling_f(x, window=window, min_periods=min_periods, center=center) + rolling_apply_f_result = mom.rolling_apply(x, window=window, func=f, + min_periods=min_periods, center=center) + assert_equal(rolling_f_result, rolling_apply_f_result) + + if (rolling_f in [mom.rolling_cov, mom.rolling_corr]) and isinstance(x, DataFrame): + # test pairwise=True + rolling_f_result = rolling_f(x, x, window=window, min_periods=min_periods, + center=center, pairwise=True) + expected = Panel(items=x.index, major_axis=x.columns, minor_axis=x.columns) + for i, _ in enumerate(x.columns): + for j, _ in enumerate(x.columns): + expected.iloc[:, i, j] = rolling_f(x.iloc[:, i], x.iloc[:, j], + window=window, min_periods=min_periods, center=center) + assert_panel_equal(rolling_f_result, expected) + # binary moments def test_rolling_cov(self): A = self.series @@ -787,14 +1335,9 @@ def _check_binary_ew(self, func): # GH 7898 for min_periods in (0, 1, 2): result = func(A, B, 20, min_periods=min_periods) - # binary functions (ewmcov, ewmcorr) *should* require at least two values - if (func == mom.ewmcov) and (min_periods <= 1): - # currenty ewmcov requires only one value, for some reason. - self.assertTrue(np.isnan(result.values[:10]).all()) - self.assertFalse(np.isnan(result.values[10:]).any()) - else: - self.assertTrue(np.isnan(result.values[:11]).all()) - self.assertFalse(np.isnan(result.values[11:]).any()) + # binary functions (ewmcov, ewmcorr) with bias=False require at least two values + self.assertTrue(np.isnan(result.values[:11]).all()) + self.assertFalse(np.isnan(result.values[11:]).any()) # check series of length 0 result = func(Series([]), Series([]), 50, min_periods=min_periods) @@ -802,11 +1345,7 @@ def _check_binary_ew(self, func): # check series of length 1 result = func(Series([1.]), Series([1.]), 50, min_periods=min_periods) - if (func == mom.ewmcov) and (min_periods <= 1): - # currenty ewmcov requires only one value, for some reason. 
- assert_series_equal(result, Series([0.])) - else: - assert_series_equal(result, Series([np.NaN])) + assert_series_equal(result, Series([np.NaN])) self.assertRaises(Exception, func, A, randn(50), 20, min_periods=5) @@ -821,6 +1360,12 @@ def expanding_mean(x, min_periods=1, freq=None): freq=freq) self._check_expanding(expanding_mean, np.mean) + # GH 8080 + s = Series([None, None, None]) + result = mom.expanding_apply(s, lambda x: len(x), min_periods=0) + expected = Series([1., 2., 3.]) + assert_series_equal(result, expected) + def test_expanding_apply_args_kwargs(self): def mean_w_arg(x, const): return np.mean(x) + const @@ -996,6 +1541,77 @@ def test_rolling_functions_window_non_shrinkage(self): df_result_panel = f(df) assert_panel_equal(df_result_panel, df_expected_panel) + def test_moment_functions_zero_length(self): + # GH 8056 + s = Series() + s_expected = s + df1 = DataFrame() + df1_expected = df1 + df1_expected_panel = Panel(items=df1.index, major_axis=df1.columns, minor_axis=df1.columns) + df2 = DataFrame(columns=['a']) + df2_expected = df2 + df2_expected_panel = Panel(items=df2.index, major_axis=df2.columns, minor_axis=df2.columns) + + functions = [lambda x: mom.expanding_count(x), + lambda x: mom.expanding_cov(x, x, pairwise=False, min_periods=5), + lambda x: mom.expanding_corr(x, x, pairwise=False, min_periods=5), + lambda x: mom.expanding_max(x, min_periods=5), + lambda x: mom.expanding_min(x, min_periods=5), + lambda x: mom.expanding_sum(x, min_periods=5), + lambda x: mom.expanding_mean(x, min_periods=5), + lambda x: mom.expanding_std(x, min_periods=5), + lambda x: mom.expanding_var(x, min_periods=5), + lambda x: mom.expanding_skew(x, min_periods=5), + lambda x: mom.expanding_kurt(x, min_periods=5), + lambda x: mom.expanding_quantile(x, quantile=0.5, min_periods=5), + lambda x: mom.expanding_median(x, min_periods=5), + lambda x: mom.expanding_apply(x, func=sum, min_periods=5), + lambda x: mom.rolling_count(x, window=10), + lambda x: mom.rolling_cov(x, x, pairwise=False, window=10, min_periods=5), + lambda x: mom.rolling_corr(x, x, pairwise=False, window=10, min_periods=5), + lambda x: mom.rolling_max(x, window=10, min_periods=5), + lambda x: mom.rolling_min(x, window=10, min_periods=5), + lambda x: mom.rolling_sum(x, window=10, min_periods=5), + lambda x: mom.rolling_mean(x, window=10, min_periods=5), + lambda x: mom.rolling_std(x, window=10, min_periods=5), + lambda x: mom.rolling_var(x, window=10, min_periods=5), + lambda x: mom.rolling_skew(x, window=10, min_periods=5), + lambda x: mom.rolling_kurt(x, window=10, min_periods=5), + lambda x: mom.rolling_quantile(x, quantile=0.5, window=10, min_periods=5), + lambda x: mom.rolling_median(x, window=10, min_periods=5), + lambda x: mom.rolling_apply(x, func=sum, window=10, min_periods=5), + lambda x: mom.rolling_window(x, win_type='boxcar', window=10, min_periods=5), + ] + for f in functions: + try: + s_result = f(s) + assert_series_equal(s_result, s_expected) + + df1_result = f(df1) + assert_frame_equal(df1_result, df1_expected) + + df2_result = f(df2) + assert_frame_equal(df2_result, df2_expected) + except (ImportError): + + # scipy needed for rolling_window + continue + + functions = [lambda x: mom.expanding_cov(x, x, pairwise=True, min_periods=5), + lambda x: mom.expanding_corr(x, x, pairwise=True, min_periods=5), + lambda x: mom.rolling_cov(x, x, pairwise=True, window=10, min_periods=5), + lambda x: mom.rolling_corr(x, x, pairwise=True, window=10, min_periods=5), + # rolling_corr_pairwise is depracated, so the following line 
should be deleted + # when rolling_corr_pairwise is removed. + lambda x: mom.rolling_corr_pairwise(x, x, window=10, min_periods=5), + ] + for f in functions: + df1_result_panel = f(df1) + assert_panel_equal(df1_result_panel, df1_expected_panel) + + df2_result_panel = f(df2) + assert_panel_equal(df2_result_panel, df2_expected_panel) + def test_expanding_cov_pairwise_diff_length(self): # GH 7512 df1 = DataFrame([[1,5], [3, 2], [3,9]], columns=['A','B']) @@ -1027,7 +1643,7 @@ def test_expanding_corr_pairwise_diff_length(self): assert_frame_equal(result2, expected) assert_frame_equal(result3, expected) assert_frame_equal(result4, expected) - + def test_pairwise_stats_column_names_order(self): # GH 7738 df1s = [DataFrame([[2,4],[1,2],[5,2],[8,1]], columns=[0,1]), @@ -1044,102 +1660,106 @@ def test_pairwise_stats_column_names_order(self): df2 = DataFrame([[None,1,1],[None,1,2],[None,3,2],[None,8,1]], columns=['Y','Z','X']) s = Series([1,1,3,8]) - # DataFrame methods (which do not call _flex_binary_moment()) - for f in [lambda x: x.cov(), - lambda x: x.corr(), - ]: - results = [f(df) for df in df1s] - for (df, result) in zip(df1s, results): - assert_index_equal(result.index, df.columns) - assert_index_equal(result.columns, df.columns) - for i, result in enumerate(results): - if i > 0: - self.assert_numpy_array_equivalent(result, results[0]) - - # DataFrame with itself, pairwise=True - for f in [lambda x: mom.expanding_cov(x, pairwise=True), - lambda x: mom.expanding_corr(x, pairwise=True), - lambda x: mom.rolling_cov(x, window=3, pairwise=True), - lambda x: mom.rolling_corr(x, window=3, pairwise=True), - lambda x: mom.ewmcov(x, com=3, pairwise=True), - lambda x: mom.ewmcorr(x, com=3, pairwise=True), - ]: - results = [f(df) for df in df1s] - for (df, result) in zip(df1s, results): - assert_index_equal(result.items, df.index) - assert_index_equal(result.major_axis, df.columns) - assert_index_equal(result.minor_axis, df.columns) - for i, result in enumerate(results): - if i > 0: - self.assert_numpy_array_equivalent(result, results[0]) - - # DataFrame with itself, pairwise=False - for f in [lambda x: mom.expanding_cov(x, pairwise=False), - lambda x: mom.expanding_corr(x, pairwise=False), - lambda x: mom.rolling_cov(x, window=3, pairwise=False), - lambda x: mom.rolling_corr(x, window=3, pairwise=False), - lambda x: mom.ewmcov(x, com=3, pairwise=False), - lambda x: mom.ewmcorr(x, com=3, pairwise=False), - ]: - results = [f(df) for df in df1s] - for (df, result) in zip(df1s, results): - assert_index_equal(result.index, df.index) - assert_index_equal(result.columns, df.columns) - for i, result in enumerate(results): - if i > 0: - self.assert_numpy_array_equivalent(result, results[0]) - - # DataFrame with another DataFrame, pairwise=True - for f in [lambda x, y: mom.expanding_cov(x, y, pairwise=True), - lambda x, y: mom.expanding_corr(x, y, pairwise=True), - lambda x, y: mom.rolling_cov(x, y, window=3, pairwise=True), - lambda x, y: mom.rolling_corr(x, y, window=3, pairwise=True), - lambda x, y: mom.ewmcov(x, y, com=3, pairwise=True), - lambda x, y: mom.ewmcorr(x, y, com=3, pairwise=True), - ]: - results = [f(df, df2) for df in df1s] - for (df, result) in zip(df1s, results): - assert_index_equal(result.items, df.index) - assert_index_equal(result.major_axis, df.columns) - assert_index_equal(result.minor_axis, df2.columns) - for i, result in enumerate(results): - if i > 0: - self.assert_numpy_array_equivalent(result, results[0]) - - # DataFrame with another DataFrame, pairwise=False - for f in [lambda x, 
y: mom.expanding_cov(x, y, pairwise=False), - lambda x, y: mom.expanding_corr(x, y, pairwise=False), - lambda x, y: mom.rolling_cov(x, y, window=3, pairwise=False), - lambda x, y: mom.rolling_corr(x, y, window=3, pairwise=False), - lambda x, y: mom.ewmcov(x, y, com=3, pairwise=False), - lambda x, y: mom.ewmcorr(x, y, com=3, pairwise=False), - ]: - results = [f(df, df2) if df.columns.is_unique else None for df in df1s] - for (df, result) in zip(df1s, results): - if result is not None: - expected_index = df.index.union(df2.index) - expected_columns = df.columns.union(df2.columns) - assert_index_equal(result.index, expected_index) - assert_index_equal(result.columns, expected_columns) - else: - tm.assertRaisesRegexp(ValueError, "'arg1' columns are not unique", f, df, df2) - tm.assertRaisesRegexp(ValueError, "'arg2' columns are not unique", f, df2, df) - - # DataFrame with a Series - for f in [lambda x, y: mom.expanding_cov(x, y), - lambda x, y: mom.expanding_corr(x, y), - lambda x, y: mom.rolling_cov(x, y, window=3), - lambda x, y: mom.rolling_corr(x, y, window=3), - lambda x, y: mom.ewmcov(x, y, com=3), - lambda x, y: mom.ewmcorr(x, y, com=3), - ]: - results = [f(df, s) for df in df1s] + [f(s, df) for df in df1s] - for (df, result) in zip(df1s, results): - assert_index_equal(result.index, df.index) - assert_index_equal(result.columns, df.columns) - for i, result in enumerate(results): - if i > 0: - self.assert_numpy_array_equivalent(result, results[0]) + # suppress warnings about incomparable objects, as we are deliberately testing with such column labels + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message=".*incomparable objects.*", category=RuntimeWarning) + + # DataFrame methods (which do not call _flex_binary_moment()) + for f in [lambda x: x.cov(), + lambda x: x.corr(), + ]: + results = [f(df) for df in df1s] + for (df, result) in zip(df1s, results): + assert_index_equal(result.index, df.columns) + assert_index_equal(result.columns, df.columns) + for i, result in enumerate(results): + if i > 0: + self.assert_numpy_array_equivalent(result, results[0]) + + # DataFrame with itself, pairwise=True + for f in [lambda x: mom.expanding_cov(x, pairwise=True), + lambda x: mom.expanding_corr(x, pairwise=True), + lambda x: mom.rolling_cov(x, window=3, pairwise=True), + lambda x: mom.rolling_corr(x, window=3, pairwise=True), + lambda x: mom.ewmcov(x, com=3, pairwise=True), + lambda x: mom.ewmcorr(x, com=3, pairwise=True), + ]: + results = [f(df) for df in df1s] + for (df, result) in zip(df1s, results): + assert_index_equal(result.items, df.index) + assert_index_equal(result.major_axis, df.columns) + assert_index_equal(result.minor_axis, df.columns) + for i, result in enumerate(results): + if i > 0: + self.assert_numpy_array_equivalent(result, results[0]) + + # DataFrame with itself, pairwise=False + for f in [lambda x: mom.expanding_cov(x, pairwise=False), + lambda x: mom.expanding_corr(x, pairwise=False), + lambda x: mom.rolling_cov(x, window=3, pairwise=False), + lambda x: mom.rolling_corr(x, window=3, pairwise=False), + lambda x: mom.ewmcov(x, com=3, pairwise=False), + lambda x: mom.ewmcorr(x, com=3, pairwise=False), + ]: + results = [f(df) for df in df1s] + for (df, result) in zip(df1s, results): + assert_index_equal(result.index, df.index) + assert_index_equal(result.columns, df.columns) + for i, result in enumerate(results): + if i > 0: + self.assert_numpy_array_equivalent(result, results[0]) + + # DataFrame with another DataFrame, pairwise=True + for f in [lambda x, 
y: mom.expanding_cov(x, y, pairwise=True), + lambda x, y: mom.expanding_corr(x, y, pairwise=True), + lambda x, y: mom.rolling_cov(x, y, window=3, pairwise=True), + lambda x, y: mom.rolling_corr(x, y, window=3, pairwise=True), + lambda x, y: mom.ewmcov(x, y, com=3, pairwise=True), + lambda x, y: mom.ewmcorr(x, y, com=3, pairwise=True), + ]: + results = [f(df, df2) for df in df1s] + for (df, result) in zip(df1s, results): + assert_index_equal(result.items, df.index) + assert_index_equal(result.major_axis, df.columns) + assert_index_equal(result.minor_axis, df2.columns) + for i, result in enumerate(results): + if i > 0: + self.assert_numpy_array_equivalent(result, results[0]) + + # DataFrame with another DataFrame, pairwise=False + for f in [lambda x, y: mom.expanding_cov(x, y, pairwise=False), + lambda x, y: mom.expanding_corr(x, y, pairwise=False), + lambda x, y: mom.rolling_cov(x, y, window=3, pairwise=False), + lambda x, y: mom.rolling_corr(x, y, window=3, pairwise=False), + lambda x, y: mom.ewmcov(x, y, com=3, pairwise=False), + lambda x, y: mom.ewmcorr(x, y, com=3, pairwise=False), + ]: + results = [f(df, df2) if df.columns.is_unique else None for df in df1s] + for (df, result) in zip(df1s, results): + if result is not None: + expected_index = df.index.union(df2.index) + expected_columns = df.columns.union(df2.columns) + assert_index_equal(result.index, expected_index) + assert_index_equal(result.columns, expected_columns) + else: + tm.assertRaisesRegexp(ValueError, "'arg1' columns are not unique", f, df, df2) + tm.assertRaisesRegexp(ValueError, "'arg2' columns are not unique", f, df2, df) + + # DataFrame with a Series + for f in [lambda x, y: mom.expanding_cov(x, y), + lambda x, y: mom.expanding_corr(x, y), + lambda x, y: mom.rolling_cov(x, y, window=3), + lambda x, y: mom.rolling_corr(x, y, window=3), + lambda x, y: mom.ewmcov(x, y, com=3), + lambda x, y: mom.ewmcorr(x, y, com=3), + ]: + results = [f(df, s) for df in df1s] + [f(s, df) for df in df1s] + for (df, result) in zip(df1s, results): + assert_index_equal(result.index, df.index) + assert_index_equal(result.columns, df.columns) + for i, result in enumerate(results): + if i > 0: + self.assert_numpy_array_equivalent(result, results[0]) def test_rolling_skew_edge_cases(self): diff --git a/pandas/stats/tests/test_ols.py b/pandas/stats/tests/test_ols.py index 5a34048fd8c8c..5c8d47ec2a82a 100644 --- a/pandas/stats/tests/test_ols.py +++ b/pandas/stats/tests/test_ols.py @@ -682,13 +682,15 @@ def testRollingWithTimeCluster(self): cluster='time') def testRollingWithNeweyWestAndEntityCluster(self): - self.checkMovingOLS(self.panel_x, self.panel_y, - nw_lags=1, cluster='entity') + self.assertRaises(ValueError, self.checkMovingOLS, + self.panel_x, self.panel_y, + nw_lags=1, cluster='entity') def testRollingWithNeweyWestAndTimeEffectsAndEntityCluster(self): - self.checkMovingOLS(self.panel_x, self.panel_y, - nw_lags=1, cluster='entity', - time_effects=True) + self.assertRaises(ValueError, + self.checkMovingOLS, self.panel_x, self.panel_y, + nw_lags=1, cluster='entity', + time_effects=True) def testExpanding(self): self.checkMovingOLS( diff --git a/pandas/tests/data/categorical_0_14_1.pickle b/pandas/tests/data/categorical_0_14_1.pickle new file mode 100644 index 0000000000000..94f882b2f3027 --- /dev/null +++ b/pandas/tests/data/categorical_0_14_1.pickle @@ -0,0 +1,94 @@ +ccopy_reg +_reconstructor +p0 +(cpandas.core.categorical +Categorical +p1 +c__builtin__ +object +p2 +Ntp3 +Rp4 +(dp5 +S'_levels' +p6 +cnumpy.core.multiarray +_reconstruct +p7 
+(cpandas.core.index +Index +p8 +(I0 +tp9 +S'b' +p10 +tp11 +Rp12 +((I1 +(I4 +tp13 +cnumpy +dtype +p14 +(S'O8' +p15 +I0 +I1 +tp16 +Rp17 +(I3 +S'|' +p18 +NNNI-1 +I-1 +I63 +tp19 +bI00 +(lp20 +S'a' +p21 +ag10 +aS'c' +p22 +aS'd' +p23 +atp24 +(Ntp25 +tp26 +bsS'labels' +p27 +g7 +(cnumpy +ndarray +p28 +(I0 +tp29 +g10 +tp30 +Rp31 +(I1 +(I3 +tp32 +g14 +(S'i8' +p33 +I0 +I1 +tp34 +Rp35 +(I3 +S'<' +p36 +NNNI-1 +I-1 +I0 +tp37 +bI00 +S'\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00' +p38 +tp39 +bsS'name' +p40 +S'foobar' +p41 +sb. \ No newline at end of file diff --git a/pandas/tests/data/s1-0.12.0.pickle b/pandas/tests/data/s1-0.12.0.pickle new file mode 100644 index 0000000000000..0ce9cfdf3aa94 Binary files /dev/null and b/pandas/tests/data/s1-0.12.0.pickle differ diff --git a/pandas/tests/data/s2-0.12.0.pickle b/pandas/tests/data/s2-0.12.0.pickle new file mode 100644 index 0000000000000..2318be2d9978b Binary files /dev/null and b/pandas/tests/data/s2-0.12.0.pickle differ diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 8b0605dd391be..814da043d0319 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -5,10 +5,11 @@ import pandas.compat as compat import pandas as pd from pandas.compat import u, StringIO -from pandas.core.base import FrozenList, FrozenNDArray, PandasDelegate, DatetimeIndexOpsMixin +from pandas.core.base import FrozenList, FrozenNDArray, PandasDelegate +from pandas.tseries.base import DatetimeIndexOpsMixin from pandas.util.testing import assertRaisesRegexp, assert_isinstance from pandas.tseries.common import is_datetimelike -from pandas import Series, Index, Int64Index, DatetimeIndex, PeriodIndex +from pandas import Series, Index, Int64Index, DatetimeIndex, TimedeltaIndex, PeriodIndex, Timedelta import pandas.tslib as tslib import nose @@ -179,6 +180,7 @@ def f(): class Ops(tm.TestCase): def setUp(self): + self.bool_index = tm.makeBoolIndex(10) self.int_index = tm.makeIntIndex(10) self.float_index = tm.makeFloatIndex(10) self.dt_index = tm.makeDateIndex(10) @@ -188,14 +190,15 @@ def setUp(self): arr = np.random.randn(10) self.int_series = Series(arr, index=self.int_index) - self.float_series = Series(arr, index=self.int_index) + self.float_series = Series(arr, index=self.float_index) self.dt_series = Series(arr, index=self.dt_index) self.dt_tz_series = self.dt_tz_index.to_series(keep_tz=True) self.period_series = Series(arr, index=self.period_index) self.string_series = Series(arr, index=self.string_index) - types = ['int','float','dt', 'dt_tz', 'period','string'] - self.objs = [ getattr(self,"{0}_{1}".format(t,f)) for t in types for f in ['index','series'] ] + types = ['bool','int','float','dt', 'dt_tz', 'period','string'] + fmts = [ "{0}_{1}".format(t,f) for t in types for f in ['index','series'] ] + self.objs = [ getattr(self,f) for f in fmts if getattr(self,f,None) is not None ] def check_ops_properties(self, props, filter=None, ignore_failures=False): for op in props: @@ -339,6 +342,9 @@ def test_value_counts_unique_nunique(self): # freq must be specified because repeat makes freq ambiguous expected_index = o[::-1] o = klass(np.repeat(values, range(1, len(o) + 1)), freq=o.freq) + # don't test boolean + elif isinstance(o,Index) and o.is_boolean(): + continue elif isinstance(o, Index): expected_index = values[::-1] o = klass(np.repeat(values, range(1, len(o) + 1))) @@ -365,6 +371,10 @@ def test_value_counts_unique_nunique(self): klass = type(o) values = o.values + if isinstance(o,Index) and 
o.is_boolean(): + # don't test boolean + continue + if ((isinstance(o, Int64Index) and not isinstance(o, (DatetimeIndex, PeriodIndex)))): # skips int64 because it doesn't allow to include nan or None @@ -519,28 +529,31 @@ def test_value_counts_inferred(self): td = klass(td) result = td.value_counts() - expected_s = Series([6], index=[86400000000000]) - self.assertEqual(result.index.dtype, 'int64') + expected_s = Series([6], index=[Timedelta('1day')]) tm.assert_series_equal(result, expected_s) - # get nanoseconds to compare - expected = np.array([86400000000000]) - self.assert_numpy_array_equal(td.unique(), expected) - self.assertEqual(td.nunique(), 1) + expected = TimedeltaIndex(['1 days']) + if isinstance(td, TimedeltaIndex): + self.assertTrue(td.unique().equals(expected)) + else: + self.assert_numpy_array_equal(td.unique(), expected.values) td2 = timedelta(1) + (df.dt - df.dt) td2 = klass(td2) result2 = td2.value_counts() - self.assertEqual(result2.index.dtype, 'int64') tm.assert_series_equal(result2, expected_s) - self.assert_numpy_array_equal(td.unique(), expected) - self.assertEqual(td.nunique(), 1) - def test_factorize(self): for o in self.objs: - exp_arr = np.array(range(len(o))) + + if isinstance(o,Index) and o.is_boolean(): + exp_arr = np.array([0,1] + [0] * 8) + exp_uniques = o + exp_uniques = Index([False,True]) + else: + exp_arr = np.array(range(len(o))) + exp_uniques = o labels, uniques = o.factorize() self.assert_numpy_array_equal(labels, exp_arr) @@ -548,16 +561,22 @@ def test_factorize(self): expected = Index(o.values) self.assert_numpy_array_equal(uniques, expected) else: - self.assertTrue(uniques.equals(o)) + self.assertTrue(uniques.equals(exp_uniques)) for o in self.objs: + + # don't test boolean + if isinstance(o,Index) and o.is_boolean(): + continue + # sort by value, and create duplicates if isinstance(o, Series): o.sort() + n = o.iloc[5:].append(o) else: indexer = o.argsort() o = o.take(indexer) - n = o[5:].append(o) + n = o[5:].append(o) exp_arr = np.array([5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) labels, uniques = n.factorize(sort=True) @@ -585,6 +604,14 @@ def test_duplicated_drop_duplicates(self): for original in self.objs: if isinstance(original, Index): + + # special case + if original.is_boolean(): + result = original.drop_duplicates() + expected = Index([False,True]) + tm.assert_index_equal(result, expected) + continue + # original doesn't have duplicates expected = Index([False] * len(original)) tm.assert_index_equal(original.duplicated(), expected) @@ -637,653 +664,6 @@ def test_duplicated_drop_duplicates(self): s.drop_duplicates(inplace=True) tm.assert_series_equal(s, original) - -class TestDatetimeIndexOps(Ops): - tz = [None, 'UTC', 'Asia/Tokyo', 'US/Eastern', - 'dateutil/Asia/Singapore', 'dateutil/US/Pacific'] - - def setUp(self): - super(TestDatetimeIndexOps, self).setUp() - mask = lambda x: isinstance(x, DatetimeIndex) or isinstance(x, PeriodIndex) or is_datetimelike(x) - self.is_valid_objs = [ o for o in self.objs if mask(o) ] - self.not_valid_objs = [ o for o in self.objs if not mask(o) ] - - def test_ops_properties(self): - self.check_ops_properties(['year','month','day','hour','minute','second','weekofyear','week','dayofweek','dayofyear','quarter']) - self.check_ops_properties(['date','time','microsecond','nanosecond', 'is_month_start', 'is_month_end', 'is_quarter_start', - 'is_quarter_end', 'is_year_start', 'is_year_end'], lambda x: isinstance(x,DatetimeIndex)) - - def test_ops_properties_basic(self): - - # sanity check that the behavior 
didn't change - # GH7206 - for op in ['year','day','second','weekday']: - self.assertRaises(TypeError, lambda x: getattr(self.dt_series,op)) - - # attribute access should still work! - s = Series(dict(year=2000,month=1,day=10)) - self.assertEquals(s.year,2000) - self.assertEquals(s.month,1) - self.assertEquals(s.day,10) - self.assertRaises(AttributeError, lambda : s.weekday) - - def test_asobject_tolist(self): - idx = pd.date_range(start='2013-01-01', periods=4, freq='M', name='idx') - expected_list = [pd.Timestamp('2013-01-31'), pd.Timestamp('2013-02-28'), - pd.Timestamp('2013-03-31'), pd.Timestamp('2013-04-30')] - expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.asobject - self.assertTrue(isinstance(result, Index)) - self.assertEqual(result.dtype, object) - self.assertTrue(result.equals(expected)) - self.assertEqual(result.name, expected.name) - self.assertEqual(idx.tolist(), expected_list) - - idx = pd.date_range(start='2013-01-01', periods=4, freq='M', name='idx', tz='Asia/Tokyo') - expected_list = [pd.Timestamp('2013-01-31', tz='Asia/Tokyo'), - pd.Timestamp('2013-02-28', tz='Asia/Tokyo'), - pd.Timestamp('2013-03-31', tz='Asia/Tokyo'), - pd.Timestamp('2013-04-30', tz='Asia/Tokyo')] - expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.asobject - self.assertTrue(isinstance(result, Index)) - self.assertEqual(result.dtype, object) - self.assertTrue(result.equals(expected)) - self.assertEqual(result.name, expected.name) - self.assertEqual(idx.tolist(), expected_list) - - idx = DatetimeIndex([datetime(2013, 1, 1), datetime(2013, 1, 2), - pd.NaT, datetime(2013, 1, 4)], name='idx') - expected_list = [pd.Timestamp('2013-01-01'), pd.Timestamp('2013-01-02'), - pd.NaT, pd.Timestamp('2013-01-04')] - expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.asobject - self.assertTrue(isinstance(result, Index)) - self.assertEqual(result.dtype, object) - self.assertTrue(result.equals(expected)) - self.assertEqual(result.name, expected.name) - self.assertEqual(idx.tolist(), expected_list) - - def test_minmax(self): - for tz in self.tz: - # monotonic - idx1 = pd.DatetimeIndex([pd.NaT, '2011-01-01', '2011-01-02', - '2011-01-03'], tz=tz) - self.assertTrue(idx1.is_monotonic) - - # non-monotonic - idx2 = pd.DatetimeIndex(['2011-01-01', pd.NaT, '2011-01-03', - '2011-01-02', pd.NaT], tz=tz) - self.assertFalse(idx2.is_monotonic) - - for idx in [idx1, idx2]: - self.assertEqual(idx.min(), pd.Timestamp('2011-01-01', tz=tz)) - self.assertEqual(idx.max(), pd.Timestamp('2011-01-03', tz=tz)) - - for op in ['min', 'max']: - # Return NaT - obj = DatetimeIndex([]) - self.assertTrue(pd.isnull(getattr(obj, op)())) - - obj = DatetimeIndex([pd.NaT]) - self.assertTrue(pd.isnull(getattr(obj, op)())) - - obj = DatetimeIndex([pd.NaT, pd.NaT, pd.NaT]) - self.assertTrue(pd.isnull(getattr(obj, op)())) - - def test_representation(self): - idx1 = DatetimeIndex([], freq='D') - idx2 = DatetimeIndex(['2011-01-01'], freq='D') - idx3 = DatetimeIndex(['2011-01-01', '2011-01-02'], freq='D') - idx4 = DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], freq='D') - idx5 = DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'], - freq='H', tz='Asia/Tokyo') - idx6 = DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', pd.NaT], - tz='US/Eastern') - - exp1 = """ -Length: 0, Freq: D, Timezone: None""" - exp2 = """ -[2011-01-01] -Length: 1, Freq: D, Timezone: None""" - exp3 = """ -[2011-01-01, 2011-01-02] -Length: 2, Freq: D, Timezone: None""" - exp4 = 
""" -[2011-01-01, ..., 2011-01-03] -Length: 3, Freq: D, Timezone: None""" - exp5 = """ -[2011-01-01 09:00:00+09:00, ..., 2011-01-01 11:00:00+09:00] -Length: 3, Freq: H, Timezone: Asia/Tokyo""" - exp6 = """ -[2011-01-01 09:00:00-05:00, ..., NaT] -Length: 3, Freq: None, Timezone: US/Eastern""" - - for idx, expected in zip([idx1, idx2, idx3, idx4, idx5, idx6], - [exp1, exp2, exp3, exp4, exp5, exp6]): - for func in ['__repr__', '__unicode__', '__str__']: - result = getattr(idx, func)() - self.assertEqual(result, expected) - - def test_resolution(self): - for freq, expected in zip(['A', 'Q', 'M', 'D', 'H', 'T', 'S', 'L', 'U'], - ['day', 'day', 'day', 'day', - 'hour', 'minute', 'second', 'millisecond', 'microsecond']): - for tz in [None, 'Asia/Tokyo', 'US/Eastern']: - idx = pd.date_range(start='2013-04-01', periods=30, freq=freq, tz=tz) - self.assertEqual(idx.resolution, expected) - - def test_add_iadd(self): - for tz in self.tz: - # union - rng1 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) - other1 = pd.date_range('1/6/2000', freq='D', periods=5, tz=tz) - expected1 = pd.date_range('1/1/2000', freq='D', periods=10, tz=tz) - - rng2 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) - other2 = pd.date_range('1/4/2000', freq='D', periods=5, tz=tz) - expected2 = pd.date_range('1/1/2000', freq='D', periods=8, tz=tz) - - rng3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) - other3 = pd.DatetimeIndex([], tz=tz) - expected3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) - - for rng, other, expected in [(rng1, other1, expected1), (rng2, other2, expected2), - (rng3, other3, expected3)]: - result_add = rng + other - result_union = rng.union(other) - - tm.assert_index_equal(result_add, expected) - tm.assert_index_equal(result_union, expected) - rng += other - tm.assert_index_equal(rng, expected) - - # offset - offsets = [pd.offsets.Hour(2), timedelta(hours=2), np.timedelta64(2, 'h')] - - for delta in offsets: - rng = pd.date_range('2000-01-01', '2000-02-01', tz=tz) - result = rng + delta - expected = pd.date_range('2000-01-01 02:00', '2000-02-01 02:00', tz=tz) - tm.assert_index_equal(result, expected) - rng += delta - tm.assert_index_equal(rng, expected) - - # int - rng = pd.date_range('2000-01-01 09:00', freq='H', periods=10, tz=tz) - result = rng + 1 - expected = pd.date_range('2000-01-01 10:00', freq='H', periods=10, tz=tz) - tm.assert_index_equal(result, expected) - rng += 1 - tm.assert_index_equal(rng, expected) - - def test_sub_isub(self): - for tz in self.tz: - # diff - rng1 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) - other1 = pd.date_range('1/6/2000', freq='D', periods=5, tz=tz) - expected1 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) - - rng2 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) - other2 = pd.date_range('1/4/2000', freq='D', periods=5, tz=tz) - expected2 = pd.date_range('1/1/2000', freq='D', periods=3, tz=tz) - - rng3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) - other3 = pd.DatetimeIndex([], tz=tz) - expected3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) - - for rng, other, expected in [(rng1, other1, expected1), (rng2, other2, expected2), - (rng3, other3, expected3)]: - result_add = rng - other - result_union = rng.diff(other) - - tm.assert_index_equal(result_add, expected) - tm.assert_index_equal(result_union, expected) - rng -= other - tm.assert_index_equal(rng, expected) - - # offset - offsets = [pd.offsets.Hour(2), timedelta(hours=2), np.timedelta64(2, 'h')] - - for delta in offsets: - rng = 
pd.date_range('2000-01-01', '2000-02-01', tz=tz) - result = rng - delta - expected = pd.date_range('1999-12-31 22:00', '2000-01-31 22:00', tz=tz) - tm.assert_index_equal(result, expected) - rng -= delta - tm.assert_index_equal(rng, expected) - - # int - rng = pd.date_range('2000-01-01 09:00', freq='H', periods=10, tz=tz) - result = rng - 1 - expected = pd.date_range('2000-01-01 08:00', freq='H', periods=10, tz=tz) - tm.assert_index_equal(result, expected) - rng -= 1 - tm.assert_index_equal(rng, expected) - - def test_value_counts_unique(self): - # GH 7735 - for tz in [None, 'UTC', 'Asia/Tokyo', 'US/Eastern']: - idx = pd.date_range('2011-01-01 09:00', freq='H', periods=10) - # create repeated values, 'n'th element is repeated by n+1 times - idx = DatetimeIndex(np.repeat(idx.values, range(1, len(idx) + 1)), tz=tz) - - exp_idx = pd.date_range('2011-01-01 18:00', freq='-1H', periods=10, tz=tz) - expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64') - tm.assert_series_equal(idx.value_counts(), expected) - - expected = pd.date_range('2011-01-01 09:00', freq='H', periods=10, tz=tz) - tm.assert_index_equal(idx.unique(), expected) - - idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 09:00', '2013-01-01 09:00', - '2013-01-01 08:00', '2013-01-01 08:00', pd.NaT], tz=tz) - - exp_idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 08:00'], tz=tz) - expected = Series([3, 2], index=exp_idx) - tm.assert_series_equal(idx.value_counts(), expected) - - exp_idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 08:00', pd.NaT], tz=tz) - expected = Series([3, 2, 1], index=exp_idx) - tm.assert_series_equal(idx.value_counts(dropna=False), expected) - - tm.assert_index_equal(idx.unique(), exp_idx) - - -class TestPeriodIndexOps(Ops): - - def setUp(self): - super(TestPeriodIndexOps, self).setUp() - mask = lambda x: isinstance(x, DatetimeIndex) or isinstance(x, PeriodIndex) or is_datetimelike(x) - self.is_valid_objs = [ o for o in self.objs if mask(o) ] - self.not_valid_objs = [ o for o in self.objs if not mask(o) ] - - def test_ops_properties(self): - self.check_ops_properties(['year','month','day','hour','minute','second','weekofyear','week','dayofweek','dayofyear','quarter']) - self.check_ops_properties(['qyear'], lambda x: isinstance(x,PeriodIndex)) - - def test_asobject_tolist(self): - idx = pd.period_range(start='2013-01-01', periods=4, freq='M', name='idx') - expected_list = [pd.Period('2013-01-31', freq='M'), pd.Period('2013-02-28', freq='M'), - pd.Period('2013-03-31', freq='M'), pd.Period('2013-04-30', freq='M')] - expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.asobject - self.assertTrue(isinstance(result, Index)) - self.assertEqual(result.dtype, object) - self.assertTrue(result.equals(expected)) - self.assertEqual(result.name, expected.name) - self.assertEqual(idx.tolist(), expected_list) - - idx = PeriodIndex(['2013-01-01', '2013-01-02', 'NaT', '2013-01-04'], freq='D', name='idx') - expected_list = [pd.Period('2013-01-01', freq='D'), pd.Period('2013-01-02', freq='D'), - pd.Period('NaT', freq='D'), pd.Period('2013-01-04', freq='D')] - expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.asobject - self.assertTrue(isinstance(result, Index)) - self.assertEqual(result.dtype, object) - for i in [0, 1, 3]: - self.assertTrue(result[i], expected[i]) - self.assertTrue(result[2].ordinal, pd.tslib.iNaT) - self.assertTrue(result[2].freq, 'D') - self.assertEqual(result.name, expected.name) - - result_list = idx.tolist() - for i in [0, 1, 3]: - 
self.assertTrue(result_list[i], expected_list[i]) - self.assertTrue(result_list[2].ordinal, pd.tslib.iNaT) - self.assertTrue(result_list[2].freq, 'D') - - def test_minmax(self): - - # monotonic - idx1 = pd.PeriodIndex([pd.NaT, '2011-01-01', '2011-01-02', - '2011-01-03'], freq='D') - self.assertTrue(idx1.is_monotonic) - - # non-monotonic - idx2 = pd.PeriodIndex(['2011-01-01', pd.NaT, '2011-01-03', - '2011-01-02', pd.NaT], freq='D') - self.assertFalse(idx2.is_monotonic) - - for idx in [idx1, idx2]: - self.assertEqual(idx.min(), pd.Period('2011-01-01', freq='D')) - self.assertEqual(idx.max(), pd.Period('2011-01-03', freq='D')) - - for op in ['min', 'max']: - # Return NaT - obj = PeriodIndex([], freq='M') - result = getattr(obj, op)() - self.assertEqual(result.ordinal, tslib.iNaT) - self.assertEqual(result.freq, 'M') - - obj = PeriodIndex([pd.NaT], freq='M') - result = getattr(obj, op)() - self.assertEqual(result.ordinal, tslib.iNaT) - self.assertEqual(result.freq, 'M') - - obj = PeriodIndex([pd.NaT, pd.NaT, pd.NaT], freq='M') - result = getattr(obj, op)() - self.assertEqual(result.ordinal, tslib.iNaT) - self.assertEqual(result.freq, 'M') - - def test_representation(self): - # GH 7601 - idx1 = PeriodIndex([], freq='D') - idx2 = PeriodIndex(['2011-01-01'], freq='D') - idx3 = PeriodIndex(['2011-01-01', '2011-01-02'], freq='D') - idx4 = PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], freq='D') - idx5 = PeriodIndex(['2011', '2012', '2013'], freq='A') - idx6 = PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'], freq='H') - - idx7 = pd.period_range('2013Q1', periods=1, freq="Q") - idx8 = pd.period_range('2013Q1', periods=2, freq="Q") - idx9 = pd.period_range('2013Q1', periods=3, freq="Q") - - exp1 = """ -Length: 0, Freq: D""" - exp2 = """ -[2011-01-01] -Length: 1, Freq: D""" - exp3 = """ -[2011-01-01, 2011-01-02] -Length: 2, Freq: D""" - exp4 = """ -[2011-01-01, ..., 2011-01-03] -Length: 3, Freq: D""" - exp5 = """ -[2011, ..., 2013] -Length: 3, Freq: A-DEC""" - exp6 = """ -[2011-01-01 09:00, ..., NaT] -Length: 3, Freq: H""" - exp7 = """ -[2013Q1] -Length: 1, Freq: Q-DEC""" - exp8 = """ -[2013Q1, 2013Q2] -Length: 2, Freq: Q-DEC""" - exp9 = """ -[2013Q1, ..., 2013Q3] -Length: 3, Freq: Q-DEC""" - - for idx, expected in zip([idx1, idx2, idx3, idx4, idx5, idx6, idx7, idx8, idx9], - [exp1, exp2, exp3, exp4, exp5, exp6, exp7, exp8, exp9]): - for func in ['__repr__', '__unicode__', '__str__']: - result = getattr(idx, func)() - self.assertEqual(result, expected) - - def test_resolution(self): - for freq, expected in zip(['A', 'Q', 'M', 'D', 'H', 'T', 'S', 'L', 'U'], - ['day', 'day', 'day', 'day', - 'hour', 'minute', 'second', 'millisecond', 'microsecond']): - - idx = pd.period_range(start='2013-04-01', periods=30, freq=freq) - self.assertEqual(idx.resolution, expected) - - def test_add_iadd(self): - # union - rng1 = pd.period_range('1/1/2000', freq='D', periods=5) - other1 = pd.period_range('1/6/2000', freq='D', periods=5) - expected1 = pd.period_range('1/1/2000', freq='D', periods=10) - - rng2 = pd.period_range('1/1/2000', freq='D', periods=5) - other2 = pd.period_range('1/4/2000', freq='D', periods=5) - expected2 = pd.period_range('1/1/2000', freq='D', periods=8) - - rng3 = pd.period_range('1/1/2000', freq='D', periods=5) - other3 = pd.PeriodIndex([], freq='D') - expected3 = pd.period_range('1/1/2000', freq='D', periods=5) - - rng4 = pd.period_range('2000-01-01 09:00', freq='H', periods=5) - other4 = pd.period_range('2000-01-02 09:00', freq='H', periods=5) - expected4 = 
pd.PeriodIndex(['2000-01-01 09:00', '2000-01-01 10:00', - '2000-01-01 11:00', '2000-01-01 12:00', - '2000-01-01 13:00', '2000-01-02 09:00', - '2000-01-02 10:00', '2000-01-02 11:00', - '2000-01-02 12:00', '2000-01-02 13:00'], - freq='H') - - rng5 = pd.PeriodIndex(['2000-01-01 09:01', '2000-01-01 09:03', - '2000-01-01 09:05'], freq='T') - other5 = pd.PeriodIndex(['2000-01-01 09:01', '2000-01-01 09:05' - '2000-01-01 09:08'], freq='T') - expected5 = pd.PeriodIndex(['2000-01-01 09:01', '2000-01-01 09:03', - '2000-01-01 09:05', '2000-01-01 09:08'], - freq='T') - - rng6 = pd.period_range('2000-01-01', freq='M', periods=7) - other6 = pd.period_range('2000-04-01', freq='M', periods=7) - expected6 = pd.period_range('2000-01-01', freq='M', periods=10) - - rng7 = pd.period_range('2003-01-01', freq='A', periods=5) - other7 = pd.period_range('1998-01-01', freq='A', periods=8) - expected7 = pd.period_range('1998-01-01', freq='A', periods=10) - - for rng, other, expected in [(rng1, other1, expected1), (rng2, other2, expected2), - (rng3, other3, expected3), (rng4, other4, expected4), - (rng5, other5, expected5), (rng6, other6, expected6), - (rng7, other7, expected7)]: - - result_add = rng + other - result_union = rng.union(other) - - tm.assert_index_equal(result_add, expected) - tm.assert_index_equal(result_union, expected) - # GH 6527 - rng += other - tm.assert_index_equal(rng, expected) - - # offset - # DateOffset - rng = pd.period_range('2014', '2024', freq='A') - result = rng + pd.offsets.YearEnd(5) - expected = pd.period_range('2019', '2029', freq='A') - tm.assert_index_equal(result, expected) - rng += pd.offsets.YearEnd(5) - tm.assert_index_equal(rng, expected) - - for o in [pd.offsets.YearBegin(2), pd.offsets.MonthBegin(1), pd.offsets.Minute(), - np.timedelta64(365, 'D'), timedelta(365)]: - with tm.assertRaisesRegexp(ValueError, 'Input has different freq from Period'): - rng + o - - rng = pd.period_range('2014-01', '2016-12', freq='M') - result = rng + pd.offsets.MonthEnd(5) - expected = pd.period_range('2014-06', '2017-05', freq='M') - tm.assert_index_equal(result, expected) - rng += pd.offsets.MonthEnd(5) - tm.assert_index_equal(rng, expected) - - for o in [pd.offsets.YearBegin(2), pd.offsets.MonthBegin(1), pd.offsets.Minute(), - np.timedelta64(365, 'D'), timedelta(365)]: - rng = pd.period_range('2014-01', '2016-12', freq='M') - with tm.assertRaisesRegexp(ValueError, 'Input has different freq from Period'): - rng + o - - # Tick - offsets = [pd.offsets.Day(3), timedelta(days=3), np.timedelta64(3, 'D'), - pd.offsets.Hour(72), timedelta(minutes=60*24*3), np.timedelta64(72, 'h')] - for delta in offsets: - rng = pd.period_range('2014-05-01', '2014-05-15', freq='D') - result = rng + delta - expected = pd.period_range('2014-05-04', '2014-05-18', freq='D') - tm.assert_index_equal(result, expected) - rng += delta - tm.assert_index_equal(rng, expected) - - for o in [pd.offsets.YearBegin(2), pd.offsets.MonthBegin(1), pd.offsets.Minute(), - np.timedelta64(4, 'h'), timedelta(hours=23)]: - rng = pd.period_range('2014-05-01', '2014-05-15', freq='D') - with tm.assertRaisesRegexp(ValueError, 'Input has different freq from Period'): - rng + o - - offsets = [pd.offsets.Hour(2), timedelta(hours=2), np.timedelta64(2, 'h'), - pd.offsets.Minute(120), timedelta(minutes=120), np.timedelta64(120, 'm')] - for delta in offsets: - rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', freq='H') - result = rng + delta - expected = pd.period_range('2014-01-01 12:00', '2014-01-05 12:00', freq='H') - 
tm.assert_index_equal(result, expected) - rng += delta - tm.assert_index_equal(rng, expected) - - for delta in [pd.offsets.YearBegin(2), timedelta(minutes=30), np.timedelta64(30, 's')]: - rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', freq='H') - with tm.assertRaisesRegexp(ValueError, 'Input has different freq from Period'): - result = rng + delta - with tm.assertRaisesRegexp(ValueError, 'Input has different freq from Period'): - rng += delta - - # int - rng = pd.period_range('2000-01-01 09:00', freq='H', periods=10) - result = rng + 1 - expected = pd.period_range('2000-01-01 10:00', freq='H', periods=10) - tm.assert_index_equal(result, expected) - rng += 1 - tm.assert_index_equal(rng, expected) - - def test_sub_isub(self): - # diff - rng1 = pd.period_range('1/1/2000', freq='D', periods=5) - other1 = pd.period_range('1/6/2000', freq='D', periods=5) - expected1 = pd.period_range('1/1/2000', freq='D', periods=5) - - rng2 = pd.period_range('1/1/2000', freq='D', periods=5) - other2 = pd.period_range('1/4/2000', freq='D', periods=5) - expected2 = pd.period_range('1/1/2000', freq='D', periods=3) - - rng3 = pd.period_range('1/1/2000', freq='D', periods=5) - other3 = pd.PeriodIndex([], freq='D') - expected3 = pd.period_range('1/1/2000', freq='D', periods=5) - - rng4 = pd.period_range('2000-01-01 09:00', freq='H', periods=5) - other4 = pd.period_range('2000-01-02 09:00', freq='H', periods=5) - expected4 = rng4 - - rng5 = pd.PeriodIndex(['2000-01-01 09:01', '2000-01-01 09:03', - '2000-01-01 09:05'], freq='T') - other5 = pd.PeriodIndex(['2000-01-01 09:01', '2000-01-01 09:05'], freq='T') - expected5 = pd.PeriodIndex(['2000-01-01 09:03'], freq='T') - - rng6 = pd.period_range('2000-01-01', freq='M', periods=7) - other6 = pd.period_range('2000-04-01', freq='M', periods=7) - expected6 = pd.period_range('2000-01-01', freq='M', periods=3) - - rng7 = pd.period_range('2003-01-01', freq='A', periods=5) - other7 = pd.period_range('1998-01-01', freq='A', periods=8) - expected7 = pd.period_range('2006-01-01', freq='A', periods=2) - - for rng, other, expected in [(rng1, other1, expected1), (rng2, other2, expected2), - (rng3, other3, expected3), (rng4, other4, expected4), - (rng5, other5, expected5), (rng6, other6, expected6), - (rng7, other7, expected7),]: - result_add = rng - other - result_union = rng.diff(other) - - tm.assert_index_equal(result_add, expected) - tm.assert_index_equal(result_union, expected) - rng -= other - tm.assert_index_equal(rng, expected) - - # offset - # DateOffset - rng = pd.period_range('2014', '2024', freq='A') - result = rng - pd.offsets.YearEnd(5) - expected = pd.period_range('2009', '2019', freq='A') - tm.assert_index_equal(result, expected) - rng -= pd.offsets.YearEnd(5) - tm.assert_index_equal(rng, expected) - - for o in [pd.offsets.YearBegin(2), pd.offsets.MonthBegin(1), pd.offsets.Minute(), - np.timedelta64(365, 'D'), timedelta(365)]: - rng = pd.period_range('2014', '2024', freq='A') - with tm.assertRaisesRegexp(ValueError, 'Input has different freq from Period'): - rng - o - - rng = pd.period_range('2014-01', '2016-12', freq='M') - result = rng - pd.offsets.MonthEnd(5) - expected = pd.period_range('2013-08', '2016-07', freq='M') - tm.assert_index_equal(result, expected) - rng -= pd.offsets.MonthEnd(5) - tm.assert_index_equal(rng, expected) - - for o in [pd.offsets.YearBegin(2), pd.offsets.MonthBegin(1), pd.offsets.Minute(), - np.timedelta64(365, 'D'), timedelta(365)]: - rng = pd.period_range('2014-01', '2016-12', freq='M') - with tm.assertRaisesRegexp(ValueError, 
'Input has different freq from Period'): - rng - o - - # Tick - offsets = [pd.offsets.Day(3), timedelta(days=3), np.timedelta64(3, 'D'), - pd.offsets.Hour(72), timedelta(minutes=60*24*3), np.timedelta64(72, 'h')] - for delta in offsets: - rng = pd.period_range('2014-05-01', '2014-05-15', freq='D') - result = rng - delta - expected = pd.period_range('2014-04-28', '2014-05-12', freq='D') - tm.assert_index_equal(result, expected) - rng -= delta - tm.assert_index_equal(rng, expected) - - for o in [pd.offsets.YearBegin(2), pd.offsets.MonthBegin(1), pd.offsets.Minute(), - np.timedelta64(4, 'h'), timedelta(hours=23)]: - rng = pd.period_range('2014-05-01', '2014-05-15', freq='D') - with tm.assertRaisesRegexp(ValueError, 'Input has different freq from Period'): - rng - o - - offsets = [pd.offsets.Hour(2), timedelta(hours=2), np.timedelta64(2, 'h'), - pd.offsets.Minute(120), timedelta(minutes=120), np.timedelta64(120, 'm')] - for delta in offsets: - rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', freq='H') - result = rng - delta - expected = pd.period_range('2014-01-01 08:00', '2014-01-05 08:00', freq='H') - tm.assert_index_equal(result, expected) - rng -= delta - tm.assert_index_equal(rng, expected) - - for delta in [pd.offsets.YearBegin(2), timedelta(minutes=30), np.timedelta64(30, 's')]: - rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', freq='H') - with tm.assertRaisesRegexp(ValueError, 'Input has different freq from Period'): - result = rng + delta - with tm.assertRaisesRegexp(ValueError, 'Input has different freq from Period'): - rng += delta - - # int - rng = pd.period_range('2000-01-01 09:00', freq='H', periods=10) - result = rng - 1 - expected = pd.period_range('2000-01-01 08:00', freq='H', periods=10) - tm.assert_index_equal(result, expected) - rng -= 1 - tm.assert_index_equal(rng, expected) - - def test_value_counts_unique(self): - # GH 7735 - idx = pd.period_range('2011-01-01 09:00', freq='H', periods=10) - # create repeated values, 'n'th element is repeated by n+1 times - idx = PeriodIndex(np.repeat(idx.values, range(1, len(idx) + 1)), freq='H') - - exp_idx = PeriodIndex(['2011-01-01 18:00', '2011-01-01 17:00', '2011-01-01 16:00', - '2011-01-01 15:00', '2011-01-01 14:00', '2011-01-01 13:00', - '2011-01-01 12:00', '2011-01-01 11:00', '2011-01-01 10:00', - '2011-01-01 09:00'], freq='H') - expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64') - tm.assert_series_equal(idx.value_counts(), expected) - - expected = pd.period_range('2011-01-01 09:00', freq='H', periods=10) - tm.assert_index_equal(idx.unique(), expected) - - idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 09:00', '2013-01-01 09:00', - '2013-01-01 08:00', '2013-01-01 08:00', pd.NaT], freq='H') - - exp_idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 08:00'], freq='H') - expected = Series([3, 2], index=exp_idx) - tm.assert_series_equal(idx.value_counts(), expected) - - exp_idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 08:00', pd.NaT], freq='H') - expected = Series([3, 2, 1], index=exp_idx) - tm.assert_series_equal(idx.value_counts(dropna=False), expected) - - tm.assert_index_equal(idx.unique(), exp_idx) - - if __name__ == '__main__': import nose diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index d07adeadb640c..03c73232f13bb 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -2,7 +2,10 @@ from datetime import datetime from pandas.compat import range, lrange, u +import os +import pickle import re +from 
distutils.version import LooseVersion import numpy as np import pandas as pd @@ -38,62 +41,51 @@ def test_constructor_unsortable(self): self.assertFalse(factor.ordered) def test_constructor(self): - # There are multiple ways to call a constructor - # old style: two arrays, one a pointer to the labels - # old style is now only available with compat=True exp_arr = np.array(["a", "b", "c", "a", "b", "c"]) - with tm.assert_produces_warning(FutureWarning): - c_old = Categorical([0,1,2,0,1,2], levels=["a","b","c"], compat=True) - self.assert_numpy_array_equal(c_old.__array__(), exp_arr) - # the next one are from the old docs - with tm.assert_produces_warning(FutureWarning): - c_old2 = Categorical([0, 1, 2, 0, 1, 2], [1, 2, 3], compat=True) - self.assert_numpy_array_equal(c_old2.__array__(), np.array([1, 2, 3, 1, 2, 3])) - with tm.assert_produces_warning(FutureWarning): - c_old3 = Categorical([0,1,2,0,1,2], ['a', 'b', 'c'], compat=True) - self.assert_numpy_array_equal(c_old3.__array__(), np.array(['a', 'b', 'c', 'a', 'b', 'c'])) - - with tm.assert_produces_warning(FutureWarning): - cat = pd.Categorical([1,2], levels=[1,2,3], compat=True) - self.assert_numpy_array_equal(cat.__array__(), np.array([2,3])) - - with tm.assert_produces_warning(None): - cat = pd.Categorical([1,2], levels=[1,2,3], compat=False) - self.assert_numpy_array_equal(cat.__array__(), np.array([1,2])) - - # new style c1 = Categorical(exp_arr) self.assert_numpy_array_equal(c1.__array__(), exp_arr) - c2 = Categorical(exp_arr, levels=["a","b","c"]) + c2 = Categorical(exp_arr, categories=["a","b","c"]) self.assert_numpy_array_equal(c2.__array__(), exp_arr) - c2 = Categorical(exp_arr, levels=["c","b","a"]) + c2 = Categorical(exp_arr, categories=["c","b","a"]) self.assert_numpy_array_equal(c2.__array__(), exp_arr) + # categories must be unique + def f(): + Categorical([1,2], [1,2,2]) + self.assertRaises(ValueError, f) + def f(): + Categorical(["a","b"], ["a","b","b"]) + self.assertRaises(ValueError, f) + def f(): + Categorical([1,2], [1,2,np.nan, np.nan]) + self.assertRaises(ValueError, f) + + # Categorical as input c1 = Categorical(["a", "b", "c", "a"]) c2 = Categorical(c1) self.assertTrue(c1.equals(c2)) - c1 = Categorical(["a", "b", "c", "a"], levels=["a","b","c","d"]) + c1 = Categorical(["a", "b", "c", "a"], categories=["a","b","c","d"]) c2 = Categorical(c1) self.assertTrue(c1.equals(c2)) - c1 = Categorical(["a", "b", "c", "a"], levels=["a","c","b"]) + c1 = Categorical(["a", "b", "c", "a"], categories=["a","c","b"]) c2 = Categorical(c1) self.assertTrue(c1.equals(c2)) - c1 = Categorical(["a", "b", "c", "a"], levels=["a","c","b"]) - c2 = Categorical(c1, levels=["a","b","c"]) + c1 = Categorical(["a", "b", "c", "a"], categories=["a","c","b"]) + c2 = Categorical(c1, categories=["a","b","c"]) self.assert_numpy_array_equal(c1.__array__(), c2.__array__()) - self.assert_numpy_array_equal(c2.levels, np.array(["a","b","c"])) + self.assert_numpy_array_equal(c2.categories, np.array(["a","b","c"])) # Series of dtype category - c1 = Categorical(["a", "b", "c", "a"], levels=["a","b","c","d"]) + c1 = Categorical(["a", "b", "c", "a"], categories=["a","b","c","d"]) c2 = Categorical(Series(c1)) self.assertTrue(c1.equals(c2)) - c1 = Categorical(["a", "b", "c", "a"], levels=["a","c","b"]) + c1 = Categorical(["a", "b", "c", "a"], categories=["a","c","b"]) c2 = Categorical(Series(c1)) self.assertTrue(c1.equals(c2)) @@ -102,17 +94,105 @@ def test_constructor(self): c2 = Categorical(Series(["a", "b", "c", "a"])) self.assertTrue(c1.equals(c2)) - c1 = 
Categorical(["a", "b", "c", "a"], levels=["a","b","c","d"]) - c2 = Categorical(Series(["a", "b", "c", "a"]), levels=["a","b","c","d"]) + c1 = Categorical(["a", "b", "c", "a"], categories=["a","b","c","d"]) + c2 = Categorical(Series(["a", "b", "c", "a"]), categories=["a","b","c","d"]) self.assertTrue(c1.equals(c2)) - # This should result in integer levels, not float! - cat = pd.Categorical([1,2,3,np.nan], levels=[1,2,3]) - self.assertTrue(com.is_integer_dtype(cat.levels)) + # This should result in integer categories, not float! + cat = pd.Categorical([1,2,3,np.nan], categories=[1,2,3]) + self.assertTrue(com.is_integer_dtype(cat.categories)) + + # https://github.com/pydata/pandas/issues/3678 + cat = pd.Categorical([np.nan,1, 2, 3]) + self.assertTrue(com.is_integer_dtype(cat.categories)) + + # this should result in floats + cat = pd.Categorical([np.nan, 1, 2., 3 ]) + self.assertTrue(com.is_float_dtype(cat.categories)) + + cat = pd.Categorical([np.nan, 1., 2., 3. ]) + self.assertTrue(com.is_float_dtype(cat.categories)) + + # preserve int as far as possible by converting to object if NaN is in categories + cat = pd.Categorical([np.nan, 1, 2, 3], categories=[np.nan, 1, 2, 3]) + self.assertTrue(com.is_object_dtype(cat.categories)) + # This doesn't work -> this would probably need some kind of "remember the original type" + # feature to try to cast the array interface result to... + #vals = np.asarray(cat[cat.notnull()]) + #self.assertTrue(com.is_integer_dtype(vals)) + cat = pd.Categorical([np.nan,"a", "b", "c"], categories=[np.nan,"a", "b", "c"]) + self.assertTrue(com.is_object_dtype(cat.categories)) + # but don't do it for floats + cat = pd.Categorical([np.nan, 1., 2., 3.], categories=[np.nan, 1., 2., 3.]) + self.assertTrue(com.is_float_dtype(cat.categories)) + + + # corner cases + cat = pd.Categorical([1]) + self.assertTrue(len(cat.categories) == 1) + self.assertTrue(cat.categories[0] == 1) + self.assertTrue(len(cat.codes) == 1) + self.assertTrue(cat.codes[0] == 0) + + cat = pd.Categorical(["a"]) + self.assertTrue(len(cat.categories) == 1) + self.assertTrue(cat.categories[0] == "a") + self.assertTrue(len(cat.codes) == 1) + self.assertTrue(cat.codes[0] == 0) + + # Scalars should be converted to lists + cat = pd.Categorical(1) + self.assertTrue(len(cat.categories) == 1) + self.assertTrue(cat.categories[0] == 1) + self.assertTrue(len(cat.codes) == 1) + self.assertTrue(cat.codes[0] == 0) + + cat = pd.Categorical([1], categories=1) + self.assertTrue(len(cat.categories) == 1) + self.assertTrue(cat.categories[0] == 1) + self.assertTrue(len(cat.codes) == 1) + self.assertTrue(cat.codes[0] == 0) + + # Catch old style constructor useage: two arrays, codes + categories + # We can only catch two cases: + # - when the first is an integer dtype and the second is not + # - when the resulting codes are all -1/NaN + with tm.assert_produces_warning(RuntimeWarning): + c_old = Categorical([0,1,2,0,1,2], categories=["a","b","c"]) + + with tm.assert_produces_warning(RuntimeWarning): + c_old = Categorical([0,1,2,0,1,2], categories=[3,4,5]) + + # the next one are from the old docs, but unfortunately these don't trigger :-( + with tm.assert_produces_warning(None): + c_old2 = Categorical([0, 1, 2, 0, 1, 2], [1, 2, 3]) + cat = Categorical([1,2], categories=[1,2,3]) + + def test_constructor_with_generator(self): + # This was raising an Error in isnull(single_val).any() because isnull returned a scalar + # for a generator + from pandas.compat import range as xrange + + exp = Categorical([0,1,2]) + cat = 
Categorical((x for x in [0,1,2])) + self.assertTrue(cat.equals(exp)) + cat = Categorical(xrange(3)) + self.assertTrue(cat.equals(exp)) + + # This uses xrange internally + from pandas.core.index import MultiIndex + MultiIndex.from_product([range(5), ['a', 'b', 'c']]) + + # check that categories accept generators and sequences + cat = pd.Categorical([0,1,2], categories=(x for x in [0,1,2])) + self.assertTrue(cat.equals(exp)) + cat = pd.Categorical([0,1,2], categories=xrange(3)) + self.assertTrue(cat.equals(exp)) + def test_from_codes(self): - # too few levels + # too few categories def f(): Categorical.from_codes([1,2], [1,2]) self.assertRaises(ValueError, f) @@ -122,7 +202,7 @@ def f(): Categorical.from_codes(["a"], [1,2]) self.assertRaises(ValueError, f) - # no unique levels + # no unique categories def f(): Categorical.from_codes([0,1,2], ["a","a","b"]) self.assertRaises(ValueError, f) @@ -133,14 +213,14 @@ def f(): self.assertRaises(ValueError, f) - exp = Categorical(["a","b","c"]) + exp = Categorical(["a","b","c"], ordered=False) res = Categorical.from_codes([0,1,2], ["a","b","c"]) self.assertTrue(exp.equals(res)) # Not available in earlier numpy versions if hasattr(np.random, "choice"): codes = np.random.choice([0,1], 5, p=[0.9,0.1]) - pd.Categorical.from_codes(codes, levels=["train", "test"]) + pd.Categorical.from_codes(codes, categories=["train", "test"]) def test_comparisons(self): result = self.factor[self.factor == 'a'] @@ -178,19 +258,75 @@ def test_comparisons(self): expected = np.repeat(False, len(self.factor)) self.assert_numpy_array_equal(result, expected) - def test_na_flags_int_levels(self): + # comparisons with categoricals + cat_rev = pd.Categorical(["a","b","c"], categories=["c","b","a"]) + cat_rev_base = pd.Categorical(["b","b","b"], categories=["c","b","a"]) + cat = pd.Categorical(["a","b","c"]) + cat_base = pd.Categorical(["b","b","b"], categories=cat.categories) + + # comparisons need to take categories ordering into account + res_rev = cat_rev > cat_rev_base + exp_rev = np.array([True, False, False]) + self.assert_numpy_array_equal(res_rev, exp_rev) + + res_rev = cat_rev < cat_rev_base + exp_rev = np.array([False, False, True]) + self.assert_numpy_array_equal(res_rev, exp_rev) + + res = cat > cat_base + exp = np.array([False, False, True]) + self.assert_numpy_array_equal(res, exp) + + # Only categories with same categories can be compared + def f(): + cat > cat_rev + self.assertRaises(TypeError, f) + + cat_rev_base2 = pd.Categorical(["b","b","b"], categories=["c","b","a","d"]) + def f(): + cat_rev > cat_rev_base2 + self.assertRaises(TypeError, f) + + # Only categories with same ordering information can be compared + cat_unorderd = cat.copy() + cat_unorderd.ordered = False + self.assertFalse((cat > cat).any()) + def f(): + cat > cat_unorderd + self.assertRaises(TypeError, f) + + # comparison (in both directions) with Series will raise + s = Series(["b","b","b"]) + self.assertRaises(TypeError, lambda: cat > s) + self.assertRaises(TypeError, lambda: cat_rev > s) + self.assertRaises(TypeError, lambda: s < cat) + self.assertRaises(TypeError, lambda: s < cat_rev) + + # comparison with numpy.array will raise in both direction, but only on newer + # numpy versions + a = np.array(["b","b","b"]) + self.assertRaises(TypeError, lambda: cat > a) + self.assertRaises(TypeError, lambda: cat_rev > a) + + # The following work via '__array_priority__ = 1000' + # works only on numpy >= 1.7.1 and not on PY3.2 + if LooseVersion(np.__version__) > "1.7.1" and not compat.PY3_2: + 
self.assertRaises(TypeError, lambda: a < cat) + self.assertRaises(TypeError, lambda: a < cat_rev) + + def test_na_flags_int_categories(self): # #1457 - levels = lrange(10) + categories = lrange(10) labels = np.random.randint(0, 10, 20) labels[::5] = -1 - cat = Categorical(labels, levels, fastpath=True) + cat = Categorical(labels, categories, fastpath=True) repr(cat) self.assert_numpy_array_equal(com.isnull(cat), labels == -1) - def test_levels_none(self): + def test_categories_none(self): factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) self.assertTrue(factor.equals(self.factor)) @@ -200,55 +336,104 @@ def test_describe(self): desc = self.factor.describe() expected = DataFrame.from_dict(dict(counts=[3, 2, 3], freqs=[3/8., 2/8., 3/8.], - levels=['a', 'b', 'c']) - ).set_index('levels') + categories=['a', 'b', 'c']) + ).set_index('categories') + tm.assert_frame_equal(desc, expected) + + # check unused categories + cat = self.factor.copy() + cat.set_categories(["a","b","c","d"], inplace=True) + desc = cat.describe() + expected = DataFrame.from_dict(dict(counts=[3, 2, 3, np.nan], + freqs=[3/8., 2/8., 3/8., np.nan], + categories=['a', 'b', 'c', 'd']) + ).set_index('categories') tm.assert_frame_equal(desc, expected) # check an integer one desc = Categorical([1,2,3,1,2,3,3,2,1,1,1]).describe() expected = DataFrame.from_dict(dict(counts=[5, 3, 3], freqs=[5/11., 3/11., 3/11.], - levels=[1,2,3] + categories=[1,2,3] ) - ).set_index('levels') + ).set_index('categories') tm.assert_frame_equal(desc, expected) + # https://github.com/pydata/pandas/issues/3678 + # describe should work with NaN + cat = pd.Categorical([np.nan,1, 2, 2]) + desc = cat.describe() + expected = DataFrame.from_dict(dict(counts=[1, 2, 1], + freqs=[1/4., 2/4., 1/4.], + categories=[1,2,np.nan] + ) + ).set_index('categories') + tm.assert_frame_equal(desc, expected) + + # having NaN as category and as "not available" should also print two NaNs in describe! 
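As a rough standalone sketch (not part of the patch) of the describe() behaviour the hunks above assert, assuming a pandas build with the categories-based API introduced in this changeset:

    import pandas as pd

    cat = pd.Categorical(["a", "b", "b", "b"], categories=["a", "b", "c"], ordered=True)

    # describe() returns a frame of counts and relative frequencies indexed by
    # 'categories'; the unused category 'c' still appears in the result even
    # though it has no observations.
    print(cat.describe())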
+ cat = pd.Categorical([np.nan,1, 2, 2]) + cat.set_categories([1,2,np.nan], rename=True, inplace=True) + desc = cat.describe() + expected = DataFrame.from_dict(dict(counts=[1, 2, np.nan, 1], + freqs=[1/4., 2/4., np.nan, 1/4.], + categories=[1,2,np.nan,np.nan] + ) + ).set_index('categories') + tm.assert_frame_equal(desc, expected) + + # empty categories show up as NA + cat = Categorical(["a","b","b","b"], categories=['a','b','c'], ordered=True) + result = cat.describe() + + expected = DataFrame([[1,0.25],[3,0.75],[np.nan,np.nan]], + columns=['counts','freqs'], + index=Index(['a','b','c'],name='categories')) + tm.assert_frame_equal(result,expected) + + # NA as a category + cat = pd.Categorical(["a","c","c",np.nan], categories=["b","a","c",np.nan] ) + result = cat.describe() + + expected = DataFrame([[np.nan, np.nan],[1,0.25],[2,0.5], [1,0.25]], + columns=['counts','freqs'], + index=Index(['b','a','c',np.nan],name='categories')) + tm.assert_frame_equal(result,expected) + + def test_print(self): - expected = [" a", " b", " b", " a", " a", " c", " c", " c", - "Levels (3, object): [a < b < c]"] + expected = ["[a, b, b, a, a, c, c, c]", + "Categories (3, object): [a < b < c]"] expected = "\n".join(expected) actual = repr(self.factor) self.assertEqual(actual, expected) def test_big_print(self): factor = Categorical([0,1,2,0,1,2]*100, ['a', 'b', 'c'], name='cat', fastpath=True) - expected = [" a", " b", " c", " a", " b", " c", " a", " b", " c", - " a", " b", " c", " a", "...", " c", " a", " b", " c", - " a", " b", " c", " a", " b", " c", " a", " b", " c", + expected = ["[a, b, c, a, b, ..., b, c, a, b, c]", "Name: cat, Length: 600", - "Levels (3, object): [a, b, c]"] + "Categories (3, object): [a, b, c]"] expected = "\n".join(expected) actual = repr(factor) - self.assertEqual(expected, actual) + self.assertEqual(actual, expected) def test_empty_print(self): factor = Categorical([], ["a","b","c"], name="cat") - expected = ("Categorical([], Name: cat, Levels (3, object): [a < b < c]") + expected = ("[], Name: cat, Categories (3, object): [a < b < c]") # hack because array_repr changed in numpy > 1.6.x actual = repr(factor) self.assertEqual(actual, expected) factor = Categorical([], ["a","b","c"]) - expected = ("Categorical([], Levels (3, object): [a < b < c]") + expected = ("[], Categories (3, object): [a < b < c]") actual = repr(factor) self.assertEqual(expected, actual) factor = Categorical([], []) - expected = ("Categorical([], Levels (0, object): []") + expected = ("[], Categories (0, object): []") self.assertEqual(expected, repr(factor)) def test_periodindex(self): @@ -260,7 +445,7 @@ def test_periodindex(self): exp_arr = np.array([0, 0, 1, 1, 2, 2],dtype='int64') exp_idx = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M') self.assert_numpy_array_equal(cat1._codes, exp_arr) - self.assertTrue(cat1.levels.equals(exp_idx)) + self.assertTrue(cat1.categories.equals(exp_idx)) idx2 = PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01', '2014-03', '2014-01'], freq='M') @@ -269,7 +454,7 @@ def test_periodindex(self): exp_arr = np.array([2, 2, 1, 0, 2, 0],dtype='int64') exp_idx2 = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M') self.assert_numpy_array_equal(cat2._codes, exp_arr) - self.assertTrue(cat2.levels.equals(exp_idx2)) + self.assertTrue(cat2.categories.equals(exp_idx2)) idx3 = PeriodIndex(['2013-12', '2013-11', '2013-10', '2013-09', '2013-08', '2013-07', '2013-05'], freq='M') @@ -278,99 +463,255 @@ def test_periodindex(self): exp_idx = PeriodIndex(['2013-05', '2013-07', 
'2013-08', '2013-09', '2013-10', '2013-11', '2013-12'], freq='M') self.assert_numpy_array_equal(cat3._codes, exp_arr) - self.assertTrue(cat3.levels.equals(exp_idx)) + self.assertTrue(cat3.categories.equals(exp_idx)) - def test_level_assigments(self): + def test_categories_assigments(self): s = pd.Categorical(["a","b","c","a"]) exp = np.array([1,2,3,1]) - s.levels = [1,2,3] + s.categories = [1,2,3] self.assert_numpy_array_equal(s.__array__(), exp) - self.assert_numpy_array_equal(s.levels, np.array([1,2,3])) + self.assert_numpy_array_equal(s.categories, np.array([1,2,3])) # lengthen - s.levels = [1,2,3,4] - # does nothing to the values but only the the levels - self.assert_numpy_array_equal(s.__array__(), exp) - self.assert_numpy_array_equal(s.levels, np.array([1,2,3,4])) + def f(): + s.categories = [1,2,3,4] + self.assertRaises(ValueError, f) # shorten - exp2 = np.array([1,2,np.nan,1]) - s.levels = [1,2] - self.assert_numpy_array_equivalent(s.__array__(), exp2) # doesn't work with nan :-( - self.assertTrue(np.isnan(s.__array__()[2])) - self.assert_numpy_array_equal(s.levels, np.array([1,2])) + def f(): + s.categories = [1,2] + self.assertRaises(ValueError, f) - def test_reorder_levels(self): + def test_set_categories(self): cat = Categorical(["a","b","c","a"], ordered=True) - exp_levels = np.array(["c","b","a"]) + exp_categories = np.array(["c","b","a"]) exp_values = np.array(["a","b","c","a"]) - cat.reorder_levels(["c","b","a"]) - self.assert_numpy_array_equal(cat.levels, exp_levels) + + res = cat.set_categories(["c","b","a"], inplace=True) + self.assert_numpy_array_equal(cat.categories, exp_categories) self.assert_numpy_array_equal(cat.__array__(), exp_values) + self.assertIsNone(res) + + res = cat.set_categories(["a","b","c"]) + # cat must be the same as before + self.assert_numpy_array_equal(cat.categories, exp_categories) + self.assert_numpy_array_equal(cat.__array__(), exp_values) + # only res is changed + exp_categories_back = np.array(["a","b","c"]) + self.assert_numpy_array_equal(res.categories, exp_categories_back) + self.assert_numpy_array_equal(res.__array__(), exp_values) + + # not all "old" included in "new" -> all not included ones are now np.nan + cat = Categorical(["a","b","c","a"], ordered=True) + res = cat.set_categories(["a"]) + self.assert_numpy_array_equal(res.codes, np.array([0,-1,-1,0])) + + # still not all "old" in "new" + res = cat.set_categories(["a","b","d"]) + self.assert_numpy_array_equal(res.codes, np.array([0,1,-1,0])) + self.assert_numpy_array_equal(res.categories, np.array(["a","b","d"])) + + # all "old" included in "new" + cat = cat.set_categories(["a","b","c","d"]) + exp_categories = np.array(["a","b","c","d"]) + self.assert_numpy_array_equal(cat.categories, exp_categories) + + # internals... + c = Categorical([1,2,3,4,1], categories=[1,2,3,4]) + self.assert_numpy_array_equal(c._codes, np.array([0,1,2,3,0])) + self.assert_numpy_array_equal(c.categories , np.array([1,2,3,4] )) + self.assert_numpy_array_equal(c.get_values(), np.array([1,2,3,4,1] )) + c = c.set_categories([4,3,2,1]) # all "pointers" to '4' must be changed from 3 to 0,... 
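A minimal illustration (not part of the patch) of set_categories(), which the tests above exercise, assuming the categories-based API:

    import pandas as pd

    cat = pd.Categorical(["a", "b", "c", "a"], ordered=True)

    # By default a new categorical is returned and the original is untouched.
    reordered = cat.set_categories(["c", "b", "a"])
    print(reordered.categories.tolist())   # ['c', 'b', 'a']
    print(list(reordered))                 # values unchanged: ['a', 'b', 'c', 'a']

    # Categories missing from the new set turn the corresponding values into
    # NaN, i.e. the missing-value code -1.
    only_a = cat.set_categories(["a"])
    print(only_a.codes)                    # [ 0 -1 -1  0]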
+ self.assert_numpy_array_equal(c._codes, np.array([3,2,1,0,3])) # positions are changed + self.assert_numpy_array_equal(c.categories, np.array([4,3,2,1])) # categories are now in new order + self.assert_numpy_array_equal(c.get_values(), np.array([1,2,3,4,1])) # output is the same + self.assertTrue(c.min(), 4) + self.assertTrue(c.max(), 1) + + def test_rename_categories(self): + cat = pd.Categorical(["a","b","c","a"]) + + # inplace=False: the old one must not be changed + res = cat.rename_categories([1,2,3]) + self.assert_numpy_array_equal(res.__array__(), np.array([1,2,3,1])) + self.assert_numpy_array_equal(res.categories, np.array([1,2,3])) + self.assert_numpy_array_equal(cat.__array__(), np.array(["a","b","c","a"])) + self.assert_numpy_array_equal(cat.categories, np.array(["a","b","c"])) + res = cat.rename_categories([1,2,3], inplace=True) + + # and now inplace + self.assertIsNone(res) + self.assert_numpy_array_equal(cat.__array__(), np.array([1,2,3,1])) + self.assert_numpy_array_equal(cat.categories, np.array([1,2,3])) + + # lengthen + def f(): + cat.rename_categories([1,2,3,4]) + self.assertRaises(ValueError, f) + # shorten + def f(): + cat.rename_categories([1,2]) + self.assertRaises(ValueError, f) + + def test_reorder_categories(self): + cat = Categorical(["a","b","c","a"], ordered=True) + old = cat.copy() + new = Categorical(["a","b","c","a"], categories=["c","b","a"], ordered=True) + + # first inplace == False + res = cat.reorder_categories(["c","b","a"]) + # cat must be the same as before + self.assert_categorical_equal(cat, old) + # only res is changed + self.assert_categorical_equal(res, new) + + # inplace == True + res = cat.reorder_categories(["c","b","a"], inplace=True) + self.assertIsNone(res) + self.assert_categorical_equal(cat, new) # not all "old" included in "new" + cat = Categorical(["a","b","c","a"], ordered=True) def f(): - cat.reorder_levels(["a"]) + cat.reorder_categories(["a"]) self.assertRaises(ValueError, f) # still not all "old" in "new" def f(): - cat.reorder_levels(["a","b","d"]) + cat.reorder_categories(["a","b","d"]) self.assertRaises(ValueError, f) - # This works: all "old" included in "new" - cat.reorder_levels(["a","b","c","d"]) - exp_levels = np.array(["a","b","c","d"]) - self.assert_numpy_array_equal(cat.levels, exp_levels) + # all "old" included in "new", but too long + def f(): + cat.reorder_categories(["a","b","c","d"]) + self.assertRaises(ValueError, f) - # internals... - c = Categorical([1,2,3,4,1], levels=[1,2,3,4]) - self.assert_numpy_array_equal(c._codes, np.array([0,1,2,3,0])) - self.assert_numpy_array_equal(c.levels , np.array([1,2,3,4] )) - self.assert_numpy_array_equal(c.get_values() , np.array([1,2,3,4,1] )) - c.reorder_levels([4,3,2,1]) # all "pointers" to '4' must be changed from 3 to 0,... 
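A hedged sketch (not part of the patch) contrasting rename_categories() and reorder_categories(), as tested above:

    import pandas as pd

    cat = pd.Categorical(["a", "b", "c", "a"], ordered=True)

    # rename_categories() relabels the categories; the codes stay the same,
    # so the values are relabelled one-to-one.
    renamed = cat.rename_categories([1, 2, 3])
    print(list(renamed))                     # [1, 2, 3, 1]

    # reorder_categories() only changes the order and must be given exactly
    # the existing categories, otherwise it raises ValueError.
    reordered = cat.reorder_categories(["c", "b", "a"])
    print(reordered.categories.tolist())     # ['c', 'b', 'a']

    try:
        cat.reorder_categories(["a", "b"])   # too few categories
    except ValueError as err:
        print("ValueError:", err)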
- self.assert_numpy_array_equal(c._codes , np.array([3,2,1,0,3])) # positions are changed - self.assert_numpy_array_equal(c.levels , np.array([4,3,2,1])) # levels are now in new order - self.assert_numpy_array_equal(c.get_values() , np.array([1,2,3,4,1])) # output is the same - self.assertTrue(c.min(), 4) - self.assertTrue(c.max(), 1) + def test_add_categories(self): + cat = Categorical(["a","b","c","a"], ordered=True) + old = cat.copy() + new = Categorical(["a","b","c","a"], categories=["a","b","c","d"], ordered=True) + + # first inplace == False + res = cat.add_categories("d") + self.assert_categorical_equal(cat, old) + self.assert_categorical_equal(res, new) + + res = cat.add_categories(["d"]) + self.assert_categorical_equal(cat, old) + self.assert_categorical_equal(res, new) + + # inplace == True + res = cat.add_categories("d", inplace=True) + self.assert_categorical_equal(cat, new) + self.assertIsNone(res) + # new is in old categories def f(): - c.reorder_levels([4,3,2,10]) + cat.add_categories(["d"]) self.assertRaises(ValueError, f) - def test_remove_unused_levels(self): - c = Categorical(["a","b","c","d","a"], levels=["a","b","c","d","e"]) - self.assert_numpy_array_equal(c.levels , np.array(["a","b","c","d","e"])) - c.remove_unused_levels() - self.assert_numpy_array_equal(c.levels , np.array(["a","b","c","d"])) + def test_remove_categories(self): + cat = Categorical(["a","b","c","a"], ordered=True) + old = cat.copy() + new = Categorical(["a","b",np.nan,"a"], categories=["a","b"], ordered=True) + + # first inplace == False + res = cat.remove_categories("c") + self.assert_categorical_equal(cat, old) + self.assert_categorical_equal(res, new) + + res = cat.remove_categories(["c"]) + self.assert_categorical_equal(cat, old) + self.assert_categorical_equal(res, new) + + # inplace == True + res = cat.remove_categories("c", inplace=True) + self.assert_categorical_equal(cat, new) + self.assertIsNone(res) + + # removal is not in categories + def f(): + cat.remove_categories(["c"]) + self.assertRaises(ValueError, f) + + def test_remove_unused_categories(self): + c = Categorical(["a","b","c","d","a"], categories=["a","b","c","d","e"]) + exp_categories_all = np.array(["a","b","c","d","e"]) + exp_categories_dropped = np.array(["a","b","c","d"]) + + self.assert_numpy_array_equal(c.categories, exp_categories_all) + + res = c.remove_unused_categories() + self.assert_numpy_array_equal(res.categories, exp_categories_dropped) + self.assert_numpy_array_equal(c.categories, exp_categories_all) + + res = c.remove_unused_categories(inplace=True) + self.assert_numpy_array_equal(c.categories, exp_categories_dropped) + self.assertIsNone(res) + def test_nan_handling(self): # Nans are represented as -1 in codes c = Categorical(["a","b",np.nan,"a"]) - self.assert_numpy_array_equal(c.levels , np.array(["a","b"])) + self.assert_numpy_array_equal(c.categories , np.array(["a","b"])) self.assert_numpy_array_equal(c._codes , np.array([0,1,-1,0])) + c[1] = np.nan + self.assert_numpy_array_equal(c.categories , np.array(["a","b"])) + self.assert_numpy_array_equal(c._codes , np.array([0,-1,-1,0])) - # If levels have nan included, the code should point to that instead - c = Categorical(["a","b",np.nan,"a"], levels=["a","b",np.nan]) - self.assert_numpy_array_equal(c.levels , np.array(["a","b",np.nan],dtype=np.object_)) + # If categories have nan included, the code should point to that instead + c = Categorical(["a","b",np.nan,"a"], categories=["a","b",np.nan]) + self.assert_numpy_array_equal(c.categories , 
np.array(["a","b",np.nan],dtype=np.object_)) self.assert_numpy_array_equal(c._codes , np.array([0,1,2,0])) + c[1] = np.nan + self.assert_numpy_array_equal(c.categories , np.array(["a","b",np.nan],dtype=np.object_)) + self.assert_numpy_array_equal(c._codes , np.array([0,2,2,0])) - # Changing levels should also make the replaced level np.nan + # Changing categories should also make the replaced category np.nan c = Categorical(["a","b","c","a"]) - c.levels = ["a","b",np.nan] - self.assert_numpy_array_equal(c.levels , np.array(["a","b",np.nan],dtype=np.object_)) + c.categories = ["a","b",np.nan] + self.assert_numpy_array_equal(c.categories , np.array(["a","b",np.nan],dtype=np.object_)) self.assert_numpy_array_equal(c._codes , np.array([0,1,2,0])) + # Adding nan to categories should make assigned nan point to the category! + c = Categorical(["a","b",np.nan,"a"]) + self.assert_numpy_array_equal(c.categories , np.array(["a","b"])) + self.assert_numpy_array_equal(c._codes , np.array([0,1,-1,0])) + c.set_categories(["a","b",np.nan], rename=True, inplace=True) + self.assert_numpy_array_equal(c.categories , np.array(["a","b",np.nan],dtype=np.object_)) + self.assert_numpy_array_equal(c._codes , np.array([0,1,-1,0])) + c[1] = np.nan + self.assert_numpy_array_equal(c.categories , np.array(["a","b",np.nan],dtype=np.object_)) + self.assert_numpy_array_equal(c._codes , np.array([0,2,-1,0])) + + + def test_isnull(self): + exp = np.array([False, False, True]) + c = Categorical(["a","b",np.nan]) + res = c.isnull() + self.assert_numpy_array_equal(res, exp) + + c = Categorical(["a","b",np.nan], categories=["a","b",np.nan]) + res = c.isnull() + self.assert_numpy_array_equal(res, exp) + + # test both nan in categories and as -1 + exp = np.array([True, False, True]) + c = Categorical(["a","b",np.nan]) + c.set_categories(["a","b",np.nan], rename=True, inplace=True) + c[0] = np.nan + res = c.isnull() + self.assert_numpy_array_equal(res, exp) + def test_codes_immutable(self): # Codes should be read only c = Categorical(["a","b","c","a", np.nan]) - exp = np.array([0,1,2,0, -1]) + exp = np.array([0,1,2,0,-1],dtype='int8') self.assert_numpy_array_equal(c.codes, exp) # Assignments to codes should raise def f(): - c.codes = np.array([0,1,2,0,1]) + c.codes = np.array([0,1,2,0,1],dtype='int8') self.assertRaises(ValueError, f) # changes in the codes array should raise @@ -382,10 +723,10 @@ def f(): # But even after getting the codes, the original array should still be writeable! 
c[4] = "a" - exp = np.array([0,1,2,0, 0]) + exp = np.array([0,1,2,0,0],dtype='int8') self.assert_numpy_array_equal(c.codes, exp) c._codes[4] = 2 - exp = np.array([0,1,2,0, 2]) + exp = np.array([0,1,2,0, 2],dtype='int8') self.assert_numpy_array_equal(c.codes, exp) @@ -400,12 +741,12 @@ def test_min_max(self): _max = cat.max() self.assertEqual(_min, "a") self.assertEqual(_max, "d") - cat = Categorical(["a","b","c","d"], levels=['d','c','b','a'], ordered=True) + cat = Categorical(["a","b","c","d"], categories=['d','c','b','a'], ordered=True) _min = cat.min() _max = cat.max() self.assertEqual(_min, "d") self.assertEqual(_max, "a") - cat = Categorical([np.nan,"b","c",np.nan], levels=['d','c','b','a'], ordered=True) + cat = Categorical([np.nan,"b","c",np.nan], categories=['d','c','b','a'], ordered=True) _min = cat.min() _max = cat.max() self.assertTrue(np.isnan(_min)) @@ -416,7 +757,7 @@ def test_min_max(self): _max = cat.max(numeric_only=True) self.assertEqual(_max, "b") - cat = Categorical([np.nan,1,2,np.nan], levels=[5,4,3,2,1], ordered=True) + cat = Categorical([np.nan,1,2,np.nan], categories=[5,4,3,2,1], ordered=True) _min = cat.min() _max = cat.max() self.assertTrue(np.isnan(_min)) @@ -427,32 +768,38 @@ def test_min_max(self): _max = cat.max(numeric_only=True) self.assertEqual(_max, 1) + def test_unique(self): + cat = Categorical(["a","b","c","d"]) + exp = np.asarray(["a","b","c","d"]) + res = cat.unique() + self.assert_numpy_array_equal(res, exp) + self.assertEqual(type(res), type(exp)) def test_mode(self): - s = Categorical([1,1,2,4,5,5,5], levels=[5,4,3,2,1], ordered=True) + s = Categorical([1,1,2,4,5,5,5], categories=[5,4,3,2,1], ordered=True) res = s.mode() - exp = Categorical([5], levels=[5,4,3,2,1], ordered=True) + exp = Categorical([5], categories=[5,4,3,2,1], ordered=True) self.assertTrue(res.equals(exp)) - s = Categorical([1,1,1,4,5,5,5], levels=[5,4,3,2,1], ordered=True) + s = Categorical([1,1,1,4,5,5,5], categories=[5,4,3,2,1], ordered=True) res = s.mode() - exp = Categorical([5,1], levels=[5,4,3,2,1], ordered=True) + exp = Categorical([5,1], categories=[5,4,3,2,1], ordered=True) self.assertTrue(res.equals(exp)) - s = Categorical([1,2,3,4,5], levels=[5,4,3,2,1], ordered=True) + s = Categorical([1,2,3,4,5], categories=[5,4,3,2,1], ordered=True) res = s.mode() - exp = Categorical([], levels=[5,4,3,2,1], ordered=True) + exp = Categorical([], categories=[5,4,3,2,1], ordered=True) self.assertTrue(res.equals(exp)) # NaN should not become the mode! 
- s = Categorical([np.nan,np.nan,np.nan,4,5], levels=[5,4,3,2,1], ordered=True) + s = Categorical([np.nan,np.nan,np.nan,4,5], categories=[5,4,3,2,1], ordered=True) res = s.mode() - exp = Categorical([], levels=[5,4,3,2,1], ordered=True) + exp = Categorical([], categories=[5,4,3,2,1], ordered=True) self.assertTrue(res.equals(exp)) - s = Categorical([np.nan,np.nan,np.nan,4,5,4], levels=[5,4,3,2,1], ordered=True) + s = Categorical([np.nan,np.nan,np.nan,4,5,4], categories=[5,4,3,2,1], ordered=True) res = s.mode() - exp = Categorical([4], levels=[5,4,3,2,1], ordered=True) + exp = Categorical([4], categories=[5,4,3,2,1], ordered=True) self.assertTrue(res.equals(exp)) - s = Categorical([np.nan,np.nan,4,5,4], levels=[5,4,3,2,1], ordered=True) + s = Categorical([np.nan,np.nan,4,5,4], categories=[5,4,3,2,1], ordered=True) res = s.mode() - exp = Categorical([4], levels=[5,4,3,2,1], ordered=True) + exp = Categorical([4], categories=[5,4,3,2,1], ordered=True) self.assertTrue(res.equals(exp)) @@ -468,7 +815,7 @@ def test_sort(self): exp = np.array(["a","b","c","d"],dtype=object) self.assert_numpy_array_equal(res.__array__(), exp) - cat = Categorical(["a","c","b","d"], levels=["a","b","c","d"], ordered=True) + cat = Categorical(["a","c","b","d"], categories=["a","b","c","d"], ordered=True) res = cat.order() exp = np.array(["a","b","c","d"],dtype=object) self.assert_numpy_array_equal(res.__array__(), exp) @@ -488,9 +835,83 @@ def test_slicing_directly(self): sliced = cat[3] tm.assert_equal(sliced, "d") sliced = cat[3:5] - expected = Categorical(["d","a"], levels=['a', 'b', 'c', 'd']) + expected = Categorical(["d","a"], categories=['a', 'b', 'c', 'd']) self.assert_numpy_array_equal(sliced._codes, expected._codes) - tm.assert_index_equal(sliced.levels, expected.levels) + tm.assert_index_equal(sliced.categories, expected.categories) + + def test_set_item_nan(self): + cat = pd.Categorical([1,2,3]) + exp = pd.Categorical([1,np.nan,3], categories=[1,2,3]) + cat[1] = np.nan + self.assertTrue(cat.equals(exp)) + + # if nan in categories, the proper code should be set! + cat = pd.Categorical([1,2,3, np.nan], categories=[1,2,3]) + cat.set_categories([1,2,3, np.nan], rename=True, inplace=True) + cat[1] = np.nan + exp = np.array([0,3,2,-1]) + self.assert_numpy_array_equal(cat.codes, exp) + + cat = pd.Categorical([1,2,3, np.nan], categories=[1,2,3]) + cat.set_categories([1,2,3, np.nan], rename=True, inplace=True) + cat[1:3] = np.nan + exp = np.array([0,3,3,-1]) + self.assert_numpy_array_equal(cat.codes, exp) + + cat = pd.Categorical([1,2,3, np.nan], categories=[1,2,3]) + cat.set_categories([1,2,3, np.nan], rename=True, inplace=True) + cat[1:3] = [np.nan, 1] + exp = np.array([0,3,0,-1]) + self.assert_numpy_array_equal(cat.codes, exp) + + cat = pd.Categorical([1,2,3, np.nan], categories=[1,2,3]) + cat.set_categories([1,2,3, np.nan], rename=True, inplace=True) + cat[1:3] = [np.nan, np.nan] + exp = np.array([0,3,3,-1]) + self.assert_numpy_array_equal(cat.codes, exp) + + cat = pd.Categorical([1,2, np.nan, 3], categories=[1,2,3]) + cat.set_categories([1,2,3, np.nan], rename=True, inplace=True) + cat[pd.isnull(cat)] = np.nan + exp = np.array([0,1,3,2]) + self.assert_numpy_array_equal(cat.codes, exp) + + def test_nbytes(self): + cat = pd.Categorical([1,2,3]) + exp = cat._codes.nbytes + cat._categories.values.nbytes + self.assertEqual(cat.nbytes, exp) + + def test_searchsorted(self): + + # See https://github.com/pydata/pandas/issues/8420 + # TODO: implement me... 
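A minimal sketch (not part of the patch) of assigning NaN into a categorical, which the set-item tests above cover: the assigned slot simply receives the missing-value code -1.

    import numpy as np
    import pandas as pd

    cat = pd.Categorical([1, 2, 3])
    cat[1] = np.nan
    print(cat.codes)          # [ 0 -1  2]
    print(pd.isnull(cat))     # [False  True False]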
+ cat = pd.Categorical([1,2,3]) + def f(): + cat.searchsorted(3) + self.assertRaises(NotImplementedError, f) + + def test_deprecated_labels(self): + # TODO: labels is deprecated and should be removed in 0.18 or 2017, whatever is earlier + cat = pd.Categorical([1,2,3, np.nan], categories=[1,2,3]) + exp = cat.codes + with tm.assert_produces_warning(FutureWarning): + res = cat.labels + self.assert_numpy_array_equal(res, exp) + self.assertFalse(LooseVersion(pd.__version__) >= '0.18') + + def test_deprecated_levels(self): + # TODO: levels is deprecated and should be removed in 0.18 or 2017, whatever is earlier + cat = pd.Categorical([1,2,3, np.nan], categories=[1,2,3]) + exp = cat.categories + with tm.assert_produces_warning(FutureWarning): + res = cat.levels + self.assert_numpy_array_equal(res, exp) + with tm.assert_produces_warning(FutureWarning): + res = pd.Categorical([1,2,3, np.nan], levels=[1,2,3]) + self.assert_numpy_array_equal(res.categories, exp) + + self.assertFalse(LooseVersion(pd.__version__) >= '0.18') + class TestCategoricalAsBlock(tm.TestCase): _multiprocess_can_split_ = True @@ -527,6 +948,47 @@ def f(): self.assertFalse(dtype == np.str_) self.assertFalse(np.str_ == dtype) + # GH8143 + index = ['cat','obj','num'] + cat = pd.Categorical(['a', 'b', 'c']) + obj = pd.Series(['a', 'b', 'c']) + num = pd.Series([1, 2, 3]) + df = pd.concat([pd.Series(cat), obj, num], axis=1, keys=index) + + result = df.dtypes == 'object' + expected = Series([False,True,False],index=index) + tm.assert_series_equal(result, expected) + + result = df.dtypes == 'int64' + expected = Series([False,False,True],index=index) + tm.assert_series_equal(result, expected) + + result = df.dtypes == 'category' + expected = Series([True,False,False],index=index) + tm.assert_series_equal(result, expected) + + def test_codes_dtypes(self): + + # GH 8453 + result = Categorical(['foo','bar','baz']) + self.assertTrue(result.codes.dtype == 'int8') + + result = Categorical(['foo%05d' % i for i in range(400) ]) + self.assertTrue(result.codes.dtype == 'int16') + + result = Categorical(['foo%05d' % i for i in range(40000) ]) + self.assertTrue(result.codes.dtype == 'int32') + + # adding cats + result = Categorical(['foo','bar','baz']) + self.assertTrue(result.codes.dtype == 'int8') + result = result.add_categories(['foo%05d' % i for i in range(400) ]) + self.assertTrue(result.codes.dtype == 'int16') + + # removing cats + result = result.remove_categories(['foo%05d' % i for i in range(300) ]) + self.assertTrue(result.codes.dtype == 'int8') + def test_basic(self): # test basic creation / coercion of categoricals @@ -582,21 +1044,66 @@ def test_creation_astype(self): df["cats"] = df["cats"].astype("category") tm.assert_frame_equal(exp_df, df) - df = pd.DataFrame({"cats":['a', 'b', 'b', 'a', 'a', 'd'], "vals":[1,2,3,4,5,6]}) cats = Categorical(['a', 'b', 'b', 'a', 'a', 'd']) exp_df = pd.DataFrame({"cats":cats, "vals":[1,2,3,4,5,6]}) df["cats"] = df["cats"].astype("category") tm.assert_frame_equal(exp_df, df) + def test_construction_series(self): + + l = [1,2,3,1] + exp = Series(l).astype('category') + res = Series(l,dtype='category') + tm.assert_series_equal(res, exp) + + l = ["a","b","c","a"] + exp = Series(l).astype('category') + res = Series(l,dtype='category') + tm.assert_series_equal(res, exp) + + # insert into frame with different index + # GH 8076 + index = pd.date_range('20000101', periods=3) + expected = Series(Categorical(values=[np.nan,np.nan,np.nan],categories=['a', 'b', 'c'])) + expected.index = index + + expected = 
DataFrame({'x': expected}) + df = DataFrame({'x': Series(['a', 'b', 'c'],dtype='category')}, index=index) + tm.assert_frame_equal(df, expected) + + def test_reindex(self): + + index = pd.date_range('20000101', periods=3) + + # reindexing to an invalid Categorical + s = Series(['a', 'b', 'c'],dtype='category') + result = s.reindex(index) + expected = Series(Categorical(values=[np.nan,np.nan,np.nan],categories=['a', 'b', 'c'])) + expected.index = index + tm.assert_series_equal(result, expected) + + # partial reindexing + expected = Series(Categorical(values=['b','c'],categories=['a', 'b', 'c'])) + expected.index = [1,2] + result = s.reindex([1,2]) + tm.assert_series_equal(result, expected) + + expected = Series(Categorical(values=['c',np.nan],categories=['a', 'b', 'c'])) + expected.index = [2,3] + result = s.reindex([2,3]) + tm.assert_series_equal(result, expected) + + + def test_sideeffects_free(self): # Passing a categorical to a Series and then changing values in either the series or the - # categorical should not change the values in the other one! + # categorical should not change the values in the other one, IF you specify copy! cat = Categorical(["a","b","c","a"]) s = pd.Series(cat, copy=True) self.assertFalse(s.cat is cat) - s.cat.levels = [1,2,3] + s.cat.categories = [1,2,3] exp_s = np.array([1,2,3,1]) exp_cat = np.array(["a","b","c","a"]) self.assert_numpy_array_equal(s.__array__(), exp_s) @@ -612,8 +1119,8 @@ def test_sideeffects_free(self): # so this WILL change values cat = Categorical(["a","b","c","a"]) s = pd.Series(cat) - self.assertTrue(s.cat is cat) - s.cat.levels = [1,2,3] + self.assertTrue(s.values is cat) + s.cat.categories = [1,2,3] exp_s = np.array([1,2,3,1]) self.assert_numpy_array_equal(s.__array__(), exp_s) self.assert_numpy_array_equal(cat.__array__(), exp_s) @@ -627,21 +1134,35 @@ def test_nan_handling(self): # Nans are represented as -1 in labels s = Series(Categorical(["a","b",np.nan,"a"])) - self.assert_numpy_array_equal(s.cat.levels, np.array(["a","b"])) - self.assert_numpy_array_equal(s.cat._codes, np.array([0,1,-1,0])) + self.assert_numpy_array_equal(s.cat.categories, np.array(["a","b"])) + self.assert_numpy_array_equal(s.values.codes, np.array([0,1,-1,0])) - # If levels have nan included, the label should point to that instead - s2 = Series(Categorical(["a","b",np.nan,"a"], levels=["a","b",np.nan])) - self.assert_numpy_array_equal(s2.cat.levels, + # If categories have nan included, the label should point to that instead + s2 = Series(Categorical(["a","b",np.nan,"a"], categories=["a","b",np.nan])) + self.assert_numpy_array_equal(s2.cat.categories, np.array(["a","b",np.nan], dtype=np.object_)) - self.assert_numpy_array_equal(s2.cat._codes, np.array([0,1,2,0])) + self.assert_numpy_array_equal(s2.values.codes, np.array([0,1,2,0])) - # Changing levels should also make the replaced level np.nan + # Changing categories should also make the replaced category np.nan s3 = Series(Categorical(["a","b","c","a"])) - s3.cat.levels = ["a","b",np.nan] - self.assert_numpy_array_equal(s3.cat.levels, + s3.cat.categories = ["a","b",np.nan] + self.assert_numpy_array_equal(s3.cat.categories, np.array(["a","b",np.nan], dtype=np.object_)) - self.assert_numpy_array_equal(s3.cat._codes, np.array([0,1,2,0])) + self.assert_numpy_array_equal(s3.values.codes, np.array([0,1,2,0])) + + def test_cat_accessor(self): + s = Series(Categorical(["a","b",np.nan,"a"])) + self.assert_numpy_array_equal(s.cat.categories, np.array(["a","b"])) + self.assertEqual(s.cat.ordered, True) + exp = 
Categorical(["a","b",np.nan,"a"], categories=["b","a"]) + s.cat.set_categories(["b", "a"], inplace=True) + self.assertTrue(s.values.equals(exp)) + res = s.cat.set_categories(["b", "a"]) + self.assertTrue(res.values.equals(exp)) + exp = Categorical(["a","b",np.nan,"a"], categories=["b","a"]) + s[:] = "a" + s = s.cat.remove_unused_categories() + self.assert_numpy_array_equal(s.cat.categories, np.array(["a"])) def test_sequence_like(self): @@ -651,8 +1172,8 @@ def test_sequence_like(self): df['grade'] = Categorical(df['raw_grade']) # basic sequencing testing - result = list(df.grade.cat) - expected = np.array(df.grade.cat).tolist() + result = list(df.grade.values) + expected = np.array(df.grade.values).tolist() tm.assert_almost_equal(result,expected) # iteration @@ -676,41 +1197,45 @@ def test_series_delegations(self): self.assertRaises(TypeError, lambda : Series(np.arange(5.)).cat) self.assertRaises(TypeError, lambda : Series([Timestamp('20130101')]).cat) - # Series should delegate calls to '.level', '.ordered' and '.reorder()' to the categorical + # Series should delegate calls to '.categories', '.codes', '.ordered' and the + # methods '.set_categories()' 'drop_unused_categories()' to the categorical s = Series(Categorical(["a","b","c","a"], ordered=True)) - exp_levels = np.array(["a","b","c"]) - self.assert_numpy_array_equal(s.cat.levels, exp_levels) + exp_categories = np.array(["a","b","c"]) + self.assert_numpy_array_equal(s.cat.categories, exp_categories) + s.cat.categories = [1,2,3] + exp_categories = np.array([1,2,3]) + self.assert_numpy_array_equal(s.cat.categories, exp_categories) + + exp_codes = Series([0,1,2,0],dtype='int8') + tm.assert_series_equal(s.cat.codes, exp_codes) - s.cat.levels = [1,2,3] - exp_levels = np.array([1,2,3]) - self.assert_numpy_array_equal(s.cat.levels, exp_levels) self.assertEqual(s.cat.ordered, True) s.cat.ordered = False self.assertEqual(s.cat.ordered, False) # reorder s = Series(Categorical(["a","b","c","a"], ordered=True)) - exp_levels = np.array(["c","b","a"]) + exp_categories = np.array(["c","b","a"]) exp_values = np.array(["a","b","c","a"]) - s.cat.reorder_levels(["c","b","a"]) - self.assert_numpy_array_equal(s.cat.levels, exp_levels) - self.assert_numpy_array_equal(s.cat.__array__(), exp_values) + s = s.cat.set_categories(["c","b","a"]) + self.assert_numpy_array_equal(s.cat.categories, exp_categories) + self.assert_numpy_array_equal(s.values.__array__(), exp_values) self.assert_numpy_array_equal(s.__array__(), exp_values) - # remove unused levels - s = Series(Categorical(["a","b","b","a"], levels=["a","b","c"])) - exp_levels = np.array(["a","b"]) + # remove unused categories + s = Series(Categorical(["a","b","b","a"], categories=["a","b","c"])) + exp_categories = np.array(["a","b"]) exp_values = np.array(["a","b","b","a"]) - s.cat.remove_unused_levels() - self.assert_numpy_array_equal(s.cat.levels, exp_levels) - self.assert_numpy_array_equal(s.cat.__array__(), exp_values) + s = s.cat.remove_unused_categories() + self.assert_numpy_array_equal(s.cat.categories, exp_categories) + self.assert_numpy_array_equal(s.values.__array__(), exp_values) self.assert_numpy_array_equal(s.__array__(), exp_values) # This method is likely to be confused, so test that it raises an error on wrong inputs: def f(): - s.reorder_levels([4,3,2,1]) + s.set_categories([4,3,2,1]) self.assertRaises(Exception, f) - # right: s.cat.reorder_levels([4,3,2,1]) + # right: s.cat.set_categories([4,3,2,1]) def test_series_functions_no_warnings(self): df = pd.DataFrame({'value': 
np.random.randint(0, 100, 20)}) @@ -724,8 +1249,8 @@ def test_assignment_to_dataframe(self): labels = [ "{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500) ] df = df.sort(columns=['value'], ascending=True) - d = pd.cut(df.value, range(0, 10500, 500), right=False, labels=labels) - s = Series(d) + s = pd.cut(df.value, range(0, 10500, 500), right=False, labels=labels) + d = s.values df['D'] = d str(df) @@ -747,13 +1272,9 @@ def test_assignment_to_dataframe(self): # sorting s.name = 'E' - self.assertTrue(result2.sort_index().equals(s)) - - # FIXME? - #### what does this compare to? ### - result = df.sort_index() + self.assertTrue(result2.sort_index().equals(s.sort_index())) - cat = pd.Categorical([1,2,3,10], levels=[1,2,3,4,10]) + cat = pd.Categorical([1,2,3,10], categories=[1,2,3,4,10]) df = pd.DataFrame(pd.Series(cat)) def test_describe(self): @@ -762,31 +1283,16 @@ def test_describe(self): result = self.cat.describe() self.assertEquals(len(result.columns),1) - # empty levels show up as NA - s = Series(Categorical(["a","b","b","b"], levels=['a','b','c'], ordered=True)) - result = s.cat.describe() - expected = DataFrame([[1,0.25],[3,0.75],[np.nan,np.nan]], - columns=['counts','freqs'], - index=Index(['a','b','c'],name='levels')) - tm.assert_frame_equal(result,expected) + # In a frame, describe() for the cat should be the same as for string arrays (count, unique, + # top, freq) + cat = Categorical(["a","b","b","b"], categories=['a','b','c'], ordered=True) + s = Series(cat) result = s.describe() expected = Series([4,2,"b",3],index=['count','unique','top', 'freq']) tm.assert_series_equal(result,expected) - # NA as a level - cat = pd.Categorical(["a","c","c",np.nan], levels=["b","a","c",np.nan] ) - result = cat.describe() - - expected = DataFrame([[np.nan, np.nan],[1,0.25],[2,0.5], [1,0.25]], - columns=['counts','freqs'], - index=Index(['b','a','c',np.nan],name='levels')) - tm.assert_frame_equal(result,expected) - - - # In a frame, describe() for the cat should be the same as for string arrays (count, unique, - # top, freq) cat = pd.Series(pd.Categorical(["a","b","c","c"])) df3 = pd.DataFrame({"cat":cat, "s":["a","b","c","c"]}) res = df3.describe() @@ -795,7 +1301,7 @@ def test_describe(self): def test_repr(self): a = pd.Series(pd.Categorical([1,2,3,4], name="a")) exp = u("0 1\n1 2\n2 3\n3 4\n" + - "Name: a, dtype: category\nLevels (4, int64): [1 < 2 < 3 < 4]") + "Name: a, dtype: category\nCategories (4, int64): [1 < 2 < 3 < 4]") self.assertEqual(exp, a.__unicode__()) @@ -803,17 +1309,29 @@ def test_repr(self): exp = u("".join(["%s a\n%s b\n"%(i,i+1) for i in range(0,10,2)]) + "...\n" + "".join(["%s a\n%s b\n"%(i,i+1) for i in range(40,50,2)]) + "Name: a, Length: 50, dtype: category\n" + - "Levels (2, object): [a < b]") + "Categories (2, object): [a < b]") self.assertEqual(exp,a._tidy_repr()) levs = list("abcdefghijklmnopqrstuvwxyz") - a = pd.Series(pd.Categorical(["a","b"], name="a", levels=levs)) + a = pd.Series(pd.Categorical(["a","b"], name="a", categories=levs)) exp = u("0 a\n1 b\n" + "Name: a, dtype: category\n" - "Levels (26, object): [a < b < c < d ... w < x < y < z]") + "Categories (26, object): [a < b < c < d ... 
w < x < y < z]") self.assertEqual(exp,a.__unicode__()) + def test_info(self): + + # make sure it works + n = 2500 + df = DataFrame({ 'int64' : np.random.randint(100,size=n) }) + df['category'] = Series(np.array(list('abcdefghij')).take(np.random.randint(0,10,size=n))).astype('category') + df.isnull() + df.info() + + df2 = df[df['category']=='d'] + df2.info() + def test_groupby_sort(self): # http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby @@ -837,41 +1355,41 @@ def test_min_max(self): self.assertEqual(_min, "a") self.assertEqual(_max, "d") - cat = Series(Categorical(["a","b","c","d"], levels=['d','c','b','a'], ordered=True)) + cat = Series(Categorical(["a","b","c","d"], categories=['d','c','b','a'], ordered=True)) _min = cat.min() _max = cat.max() self.assertEqual(_min, "d") self.assertEqual(_max, "a") - cat = Series(Categorical([np.nan,"b","c",np.nan], levels=['d','c','b','a'], ordered=True)) + cat = Series(Categorical([np.nan,"b","c",np.nan], categories=['d','c','b','a'], ordered=True)) _min = cat.min() _max = cat.max() self.assertTrue(np.isnan(_min)) self.assertEqual(_max, "b") - cat = Series(Categorical([np.nan,1,2,np.nan], levels=[5,4,3,2,1], ordered=True)) + cat = Series(Categorical([np.nan,1,2,np.nan], categories=[5,4,3,2,1], ordered=True)) _min = cat.min() _max = cat.max() self.assertTrue(np.isnan(_min)) self.assertEqual(_max, 1) def test_mode(self): - s = Series(Categorical([1,1,2,4,5,5,5], levels=[5,4,3,2,1], ordered=True)) + s = Series(Categorical([1,1,2,4,5,5,5], categories=[5,4,3,2,1], ordered=True)) res = s.mode() - exp = Series(Categorical([5], levels=[5,4,3,2,1], ordered=True)) + exp = Series(Categorical([5], categories=[5,4,3,2,1], ordered=True)) tm.assert_series_equal(res, exp) - s = Series(Categorical([1,1,1,4,5,5,5], levels=[5,4,3,2,1], ordered=True)) + s = Series(Categorical([1,1,1,4,5,5,5], categories=[5,4,3,2,1], ordered=True)) res = s.mode() - exp = Series(Categorical([5,1], levels=[5,4,3,2,1], ordered=True)) + exp = Series(Categorical([5,1], categories=[5,4,3,2,1], ordered=True)) tm.assert_series_equal(res, exp) - s = Series(Categorical([1,2,3,4,5], levels=[5,4,3,2,1], ordered=True)) + s = Series(Categorical([1,2,3,4,5], categories=[5,4,3,2,1], ordered=True)) res = s.mode() - exp = Series(Categorical([], levels=[5,4,3,2,1], ordered=True)) + exp = Series(Categorical([], categories=[5,4,3,2,1], ordered=True)) tm.assert_series_equal(res, exp) def test_value_counts(self): - s = pd.Series(pd.Categorical(["a","b","c","c","c","b"], levels=["c","a","b","d"])) + s = pd.Series(pd.Categorical(["a","b","c","c","c","b"], categories=["c","a","b","d"])) res = s.value_counts(sort=False) exp = Series([3,1,2,0], index=["c","a","b","d"]) tm.assert_series_equal(res, exp) @@ -881,15 +1399,15 @@ def test_value_counts(self): def test_groupby(self): - cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], levels=["a","b","c","d"]) + cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], categories=["a","b","c","d"]) data = DataFrame({"a":[1,1,1,2,2,2,3,4,5], "b":cats}) expected = DataFrame({ 'a' : Series([1,2,4,np.nan],index=Index(['a','b','c','d'],name='b')) }) result = data.groupby("b").mean() tm.assert_frame_equal(result, expected) - raw_cat1 = Categorical(["a","a","b","b"], levels=["a","b","z"]) - raw_cat2 = Categorical(["c","d","c","d"], levels=["c","d","y"]) + raw_cat1 = Categorical(["a","a","b","b"], categories=["a","b","z"]) + raw_cat2 = Categorical(["c","d","c","d"], categories=["c","d","y"]) df = 
DataFrame({"A":raw_cat1,"B":raw_cat2, "values":[1,2,3,4]}) # single grouper @@ -921,8 +1439,8 @@ def test_groupby(self): def test_pivot_table(self): - raw_cat1 = Categorical(["a","a","b","b"], levels=["a","b","z"]) - raw_cat2 = Categorical(["c","d","c","d"], levels=["c","d","y"]) + raw_cat1 = Categorical(["a","a","b","b"], categories=["a","b","z"]) + raw_cat2 = Categorical(["c","d","c","d"], categories=["c","d","y"]) df = DataFrame({"A":raw_cat1,"B":raw_cat2, "values":[1,2,3,4]}) result = pd.pivot_table(df, values='values', index=['A', 'B']) @@ -933,7 +1451,7 @@ def test_pivot_table(self): def test_count(self): - s = Series(Categorical([np.nan,1,2,np.nan], levels=[5,4,3,2,1], ordered=True)) + s = Series(Categorical([np.nan,1,2,np.nan], categories=[5,4,3,2,1], ordered=True)) result = s.count() self.assertEqual(result, 2) @@ -949,7 +1467,7 @@ def test_sort(self): exp = np.array(["a","b","c","d"]) self.assert_numpy_array_equal(res.__array__(), exp) - cat = Series(Categorical(["a","c","b","d"], levels=["a","b","c","d"], ordered=True)) + cat = Series(Categorical(["a","c","b","d"], categories=["a","b","c","d"], ordered=True)) res = cat.order() exp = np.array(["a","b","c","d"]) self.assert_numpy_array_equal(res.__array__(), exp) @@ -958,15 +1476,15 @@ def test_sort(self): exp = np.array(["d","c","b","a"]) self.assert_numpy_array_equal(res.__array__(), exp) - raw_cat1 = Categorical(["a","b","c","d"], levels=["a","b","c","d"], ordered=False) - raw_cat2 = Categorical(["a","b","c","d"], levels=["d","c","b","a"]) + raw_cat1 = Categorical(["a","b","c","d"], categories=["a","b","c","d"], ordered=False) + raw_cat2 = Categorical(["a","b","c","d"], categories=["d","c","b","a"]) s = ["a","b","c","d"] df = DataFrame({"unsort":raw_cat1,"sort":raw_cat2, "string":s, "values":[1,2,3,4]}) # Cats must be sorted in a dataframe res = df.sort(columns=["string"], ascending=False) exp = np.array(["d", "c", "b", "a"]) - self.assert_numpy_array_equal(res["sort"].cat.__array__(), exp) + self.assert_numpy_array_equal(res["sort"].values.__array__(), exp) self.assertEqual(res["sort"].dtype, "category") res = df.sort(columns=["sort"], ascending=False) @@ -983,9 +1501,9 @@ def f(): # GH 7848 df = DataFrame({"id":[6,5,4,3,2,1], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']}) df["grade"] = pd.Categorical(df["raw_grade"]) - df['grade'].cat.reorder_levels(['b', 'e', 'a']) + df['grade'] = df['grade'].cat.set_categories(['b', 'e', 'a']) - # sorts 'grade' according to the order of the levels + # sorts 'grade' according to the order of the categories result = df.sort(columns=['grade']) expected = df.iloc[[1,2,5,0,3,4]] tm.assert_frame_equal(result,expected) @@ -999,27 +1517,39 @@ def f(): cat = Categorical(["a","c","c","b","d"], ordered=True) res = cat.order(ascending=False) exp_val = np.array(["d","c", "c", "b","a"],dtype=object) - exp_levels = np.array(["a","b","c","d"],dtype=object) + exp_categories = np.array(["a","b","c","d"],dtype=object) self.assert_numpy_array_equal(res.__array__(), exp_val) - self.assert_numpy_array_equal(res.levels, exp_levels) + self.assert_numpy_array_equal(res.categories, exp_categories) # some NaN positions cat = Categorical(["a","c","b","d", np.nan], ordered=True) res = cat.order(ascending=False, na_position='last') exp_val = np.array(["d","c","b","a", np.nan],dtype=object) - exp_levels = np.array(["a","b","c","d"],dtype=object) - # FIXME: IndexError: Out of bounds on buffer access (axis 0) - #self.assert_numpy_array_equal(res.__array__(), exp_val) - #self.assert_numpy_array_equal(res.levels, exp_levels) + 
exp_categories = np.array(["a","b","c","d"],dtype=object) + self.assert_numpy_array_equal(res.__array__(), exp_val) + self.assert_numpy_array_equal(res.categories, exp_categories) cat = Categorical(["a","c","b","d", np.nan], ordered=True) res = cat.order(ascending=False, na_position='first') exp_val = np.array([np.nan, "d","c","b","a"],dtype=object) - exp_levels = np.array(["a","b","c","d"],dtype=object) - # FIXME: IndexError: Out of bounds on buffer access (axis 0) - #self.assert_numpy_array_equal(res.__array__(), exp_val) - #self.assert_numpy_array_equal(res.levels, exp_levels) + exp_categories = np.array(["a","b","c","d"],dtype=object) + self.assert_numpy_array_equal(res.__array__(), exp_val) + self.assert_numpy_array_equal(res.categories, exp_categories) + + cat = Categorical(["a","c","b","d", np.nan], ordered=True) + res = cat.order(ascending=False, na_position='first') + exp_val = np.array([np.nan, "d","c","b","a"],dtype=object) + exp_categories = np.array(["a","b","c","d"],dtype=object) + self.assert_numpy_array_equal(res.__array__(), exp_val) + self.assert_numpy_array_equal(res.categories, exp_categories) + + cat = Categorical(["a","c","b","d", np.nan], ordered=True) + res = cat.order(ascending=False, na_position='last') + exp_val = np.array(["d","c","b","a",np.nan],dtype=object) + exp_categories = np.array(["a","b","c","d"],dtype=object) + self.assert_numpy_array_equal(res.__array__(), exp_val) + self.assert_numpy_array_equal(res.categories, exp_categories) def test_slicing(self): cat = Series(Categorical([1,2,3,4])) @@ -1053,13 +1583,13 @@ def test_slicing_and_getting_ops(self): # - returning a row # - returning a single value - cats = pd.Categorical(["a","c","b","c","c","c","c"], levels=["a","b","c"]) + cats = pd.Categorical(["a","c","b","c","c","c","c"], categories=["a","b","c"]) idx = pd.Index(["h","i","j","k","l","m","n"]) values= [1,2,3,4,5,6,7] df = pd.DataFrame({"cats":cats,"values":values}, index=idx) # the expected values - cats2 = pd.Categorical(["b","c"], levels=["a","b","c"]) + cats2 = pd.Categorical(["b","c"], categories=["a","b","c"]) idx2 = pd.Index(["j","k"]) values2= [3,4] @@ -1184,13 +1714,13 @@ def test_slicing_and_getting_ops(self): def test_slicing_doc_examples(self): #GH 7918 - cats = Categorical(["a","b","b","b","c","c","c"], levels=["a","b","c"]) + cats = Categorical(["a","b","b","b","c","c","c"], categories=["a","b","c"]) idx = Index(["h","i","j","k","l","m","n",]) values= [1,2,2,2,3,4,5] df = DataFrame({"cats":cats,"values":values}, index=idx) result = df.iloc[2:4,:] - expected = DataFrame({"cats":Categorical(['b','b'],levels=['a','b','c']),"values":[2,2]}, index=['j','k']) + expected = DataFrame({"cats":Categorical(['b','b'],categories=['a','b','c']),"values":[2,2]}, index=['j','k']) tm.assert_frame_equal(result, expected) result = df.iloc[2:4,:].dtypes @@ -1198,50 +1728,50 @@ def test_slicing_doc_examples(self): tm.assert_series_equal(result, expected) result = df.loc["h":"j","cats"] - expected = Series(Categorical(['a','b','b'],levels=['a','b','c']),index=['h','i','j']) + expected = Series(Categorical(['a','b','b'],categories=['a','b','c']),index=['h','i','j']) tm.assert_series_equal(result, expected) result = df.ix["h":"j",0:1] - expected = DataFrame({'cats' : Series(Categorical(['a','b','b'],levels=['a','b','c']),index=['h','i','j']) }) + expected = DataFrame({'cats' : Series(Categorical(['a','b','b'],categories=['a','b','c']),index=['h','i','j']) }) tm.assert_frame_equal(result, expected) def test_assigning_ops(self): # systematically test the 
assigning operations: # for all slicing ops: - # for value in levels and value not in levels: + # for value in categories and value not in categories: # - assign a single value -> exp_single_cats_value # - assign a complete row (mixed values) -> exp_single_row # - assign multiple rows (mixed values) (-> array) -> exp_multi_row # - assign a part of a column with dtype == categorical -> exp_parts_cats_col # - assign a part of a column with dtype != categorical -> exp_parts_cats_col - cats = pd.Categorical(["a","a","a","a","a","a","a"], levels=["a","b"]) + cats = pd.Categorical(["a","a","a","a","a","a","a"], categories=["a","b"]) idx = pd.Index(["h","i","j","k","l","m","n"]) values = [1,1,1,1,1,1,1] orig = pd.DataFrame({"cats":cats,"values":values}, index=idx) ### the expected values # changed single row - cats1 = pd.Categorical(["a","a","b","a","a","a","a"], levels=["a","b"]) + cats1 = pd.Categorical(["a","a","b","a","a","a","a"], categories=["a","b"]) idx1 = pd.Index(["h","i","j","k","l","m","n"]) values1 = [1,1,2,1,1,1,1] exp_single_row = pd.DataFrame({"cats":cats1,"values":values1}, index=idx1) #changed multiple rows - cats2 = pd.Categorical(["a","a","b","b","a","a","a"], levels=["a","b"]) + cats2 = pd.Categorical(["a","a","b","b","a","a","a"], categories=["a","b"]) idx2 = pd.Index(["h","i","j","k","l","m","n"]) values2 = [1,1,2,2,1,1,1] exp_multi_row = pd.DataFrame({"cats":cats2,"values":values2}, index=idx2) # changed part of the cats column - cats3 = pd.Categorical(["a","a","b","b","a","a","a"], levels=["a","b"]) + cats3 = pd.Categorical(["a","a","b","b","a","a","a"], categories=["a","b"]) idx3 = pd.Index(["h","i","j","k","l","m","n"]) values3 = [1,1,1,1,1,1,1] exp_parts_cats_col = pd.DataFrame({"cats":cats3,"values":values3}, index=idx3) # changed single value in cats col - cats4 = pd.Categorical(["a","a","b","a","a","a","a"], levels=["a","b"]) + cats4 = pd.Categorical(["a","a","b","a","a","a","a"], categories=["a","b"]) idx4 = pd.Index(["h","i","j","k","l","m","n"]) values4 = [1,1,1,1,1,1,1] exp_single_cats_value = pd.DataFrame({"cats":cats4,"values":values4}, index=idx4) @@ -1253,7 +1783,13 @@ def test_assigning_ops(self): df.iloc[2,0] = "b" tm.assert_frame_equal(df, exp_single_cats_value) - # - assign a single value not in the current level set + + df = orig.copy() + df.iloc[df.index == "j",0] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + + # - assign a single value not in the current categories set def f(): df = orig.copy() df.iloc[2,0] = "c" @@ -1264,7 +1800,7 @@ def f(): df.iloc[2,:] = ["b",2] tm.assert_frame_equal(df, exp_single_row) - # - assign a complete row (mixed values) not in level set + # - assign a complete row (mixed values) not in categories set def f(): df = orig.copy() df.iloc[2,:] = ["c",2] @@ -1282,18 +1818,18 @@ def f(): # - assign a part of a column with dtype == categorical -> exp_parts_cats_col df = orig.copy() - df.iloc[2:4,0] = pd.Categorical(["b","b"], levels=["a","b"]) + df.iloc[2:4,0] = pd.Categorical(["b","b"], categories=["a","b"]) tm.assert_frame_equal(df, exp_parts_cats_col) with tm.assertRaises(ValueError): - # different levels -> not sure if this should fail or pass + # different categories -> not sure if this should fail or pass df = orig.copy() - df.iloc[2:4,0] = pd.Categorical(["b","b"], levels=["a","b","c"]) + df.iloc[2:4,0] = pd.Categorical(["b","b"], categories=["a","b","c"]) with tm.assertRaises(ValueError): # different values df = orig.copy() - df.iloc[2:4,0] = pd.Categorical(["c","c"], levels=["a","b","c"]) + df.iloc[2:4,0] 
= pd.Categorical(["c","c"], categories=["a","b","c"]) # - assign a part of a column with dtype != categorical -> exp_parts_cats_col df = orig.copy() @@ -1310,7 +1846,11 @@ def f(): df.loc["j","cats"] = "b" tm.assert_frame_equal(df, exp_single_cats_value) - # - assign a single value not in the current level set + df = orig.copy() + df.loc[df.index == "j","cats"] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + # - assign a single value not in the current categories set def f(): df = orig.copy() df.loc["j","cats"] = "c" @@ -1321,7 +1861,7 @@ def f(): df.loc["j",:] = ["b",2] tm.assert_frame_equal(df, exp_single_row) - # - assign a complete row (mixed values) not in level set + # - assign a complete row (mixed values) not in categories set def f(): df = orig.copy() df.loc["j",:] = ["c",2] @@ -1339,18 +1879,18 @@ def f(): # - assign a part of a column with dtype == categorical -> exp_parts_cats_col df = orig.copy() - df.loc["j":"k","cats"] = pd.Categorical(["b","b"], levels=["a","b"]) + df.loc["j":"k","cats"] = pd.Categorical(["b","b"], categories=["a","b"]) tm.assert_frame_equal(df, exp_parts_cats_col) with tm.assertRaises(ValueError): - # different levels -> not sure if this should fail or pass + # different categories -> not sure if this should fail or pass df = orig.copy() - df.loc["j":"k","cats"] = pd.Categorical(["b","b"], levels=["a","b","c"]) + df.loc["j":"k","cats"] = pd.Categorical(["b","b"], categories=["a","b","c"]) with tm.assertRaises(ValueError): # different values df = orig.copy() - df.loc["j":"k","cats"] = pd.Categorical(["c","c"], levels=["a","b","c"]) + df.loc["j":"k","cats"] = pd.Categorical(["c","c"], categories=["a","b","c"]) # - assign a part of a column with dtype != categorical -> exp_parts_cats_col df = orig.copy() @@ -1367,7 +1907,11 @@ def f(): df.ix["j",0] = "b" tm.assert_frame_equal(df, exp_single_cats_value) - # - assign a single value not in the current level set + df = orig.copy() + df.ix[df.index == "j",0] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + # - assign a single value not in the current categories set def f(): df = orig.copy() df.ix["j",0] = "c" @@ -1378,7 +1922,7 @@ def f(): df.ix["j",:] = ["b",2] tm.assert_frame_equal(df, exp_single_row) - # - assign a complete row (mixed values) not in level set + # - assign a complete row (mixed values) not in categories set def f(): df = orig.copy() df.ix["j",:] = ["c",2] @@ -1396,18 +1940,18 @@ def f(): # - assign a part of a column with dtype == categorical -> exp_parts_cats_col df = orig.copy() - df.ix["j":"k",0] = pd.Categorical(["b","b"], levels=["a","b"]) + df.ix["j":"k",0] = pd.Categorical(["b","b"], categories=["a","b"]) tm.assert_frame_equal(df, exp_parts_cats_col) with tm.assertRaises(ValueError): - # different levels -> not sure if this should fail or pass + # different categories -> not sure if this should fail or pass df = orig.copy() - df.ix["j":"k",0] = pd.Categorical(["b","b"], levels=["a","b","c"]) + df.ix["j":"k",0] = pd.Categorical(["b","b"], categories=["a","b","c"]) with tm.assertRaises(ValueError): # different values df = orig.copy() - df.ix["j":"k",0] = pd.Categorical(["c","c"], levels=["a","b","c"]) + df.ix["j":"k",0] = pd.Categorical(["c","c"], categories=["a","b","c"]) # - assign a part of a column with dtype != categorical -> exp_parts_cats_col df = orig.copy() @@ -1422,7 +1966,7 @@ def f(): df.iat[2,0] = "b" tm.assert_frame_equal(df, exp_single_cats_value) - # - assign a single value not in the current level set + # - assign a single value not in the current 
categories set def f(): df = orig.copy() df.iat[2,0] = "c" @@ -1434,20 +1978,20 @@ def f(): df.at["j","cats"] = "b" tm.assert_frame_equal(df, exp_single_cats_value) - # - assign a single value not in the current level set + # - assign a single value not in the current categories set def f(): df = orig.copy() df.at["j","cats"] = "c" self.assertRaises(ValueError, f) # fancy indexing - catsf = pd.Categorical(["a","a","c","c","a","a","a"], levels=["a","b","c"]) + catsf = pd.Categorical(["a","a","c","c","a","a","a"], categories=["a","b","c"]) idxf = pd.Index(["h","i","j","k","l","m","n"]) valuesf = [1,1,3,3,1,1,1] df = pd.DataFrame({"cats":catsf,"values":valuesf}, index=idxf) exp_fancy = exp_multi_row.copy() - exp_fancy["cats"].cat.levels = ["a","b","c"] + exp_fancy["cats"].cat.set_categories(["a","b","c"], inplace=True) df[df["cats"] == "c"] = ["b",2] tm.assert_frame_equal(df, exp_multi_row) @@ -1465,79 +2009,165 @@ def f(): # Assigning a Category to parts of a int/... column uses the values of the Catgorical df = pd.DataFrame({"a":[1,1,1,1,1], "b":["a","a","a","a","a"]}) exp = pd.DataFrame({"a":[1,"b","b",1,1], "b":["a","a","b","b","a"]}) - df.loc[1:2,"a"] = pd.Categorical(["b","b"], levels=["a","b"]) - df.loc[2:3,"b"] = pd.Categorical(["b","b"], levels=["a","b"]) + df.loc[1:2,"a"] = pd.Categorical(["b","b"], categories=["a","b"]) + df.loc[2:3,"b"] = pd.Categorical(["b","b"], categories=["a","b"]) tm.assert_frame_equal(df, exp) + ######### Series ########## + orig = Series(pd.Categorical(["b","b"], categories=["a","b"])) + s = orig.copy() + s[:] = "a" + exp = Series(pd.Categorical(["a","a"], categories=["a","b"])) + tm.assert_series_equal(s, exp) + + s = orig.copy() + s[1] = "a" + exp = Series(pd.Categorical(["b","a"], categories=["a","b"])) + tm.assert_series_equal(s, exp) + + s = orig.copy() + s[s.index > 0] = "a" + exp = Series(pd.Categorical(["b","a"], categories=["a","b"])) + tm.assert_series_equal(s, exp) + + s = orig.copy() + s[[False, True]] = "a" + exp = Series(pd.Categorical(["b","a"], categories=["a","b"])) + tm.assert_series_equal(s, exp) + + s = orig.copy() + s.index = ["x", "y"] + s["y"] = "a" + exp = Series(pd.Categorical(["b","a"], categories=["a","b"]), index=["x", "y"]) + tm.assert_series_equal(s, exp) + + # ensure that one can set something to np.nan + s = Series(Categorical([1,2,3])) + exp = Series(Categorical([1,np.nan,3])) + s[1] = np.nan + tm.assert_series_equal(s, exp) + + + def test_comparisons(self): + tests_data = [(list("abc"), list("cba"), list("bbb")), + ([1,2,3], [3,2,1], [2,2,2])] + for data , reverse, base in tests_data: + cat_rev = pd.Series(pd.Categorical(data, categories=reverse)) + cat_rev_base = pd.Series(pd.Categorical(base, categories=reverse)) + cat = pd.Series(pd.Categorical(data)) + cat_base = pd.Series(pd.Categorical(base, categories=cat.cat.categories)) + s = Series(base) + a = np.array(base) + + # comparisons need to take categories ordering into account + res_rev = cat_rev > cat_rev_base + exp_rev = Series([True, False, False]) + tm.assert_series_equal(res_rev, exp_rev) + + res_rev = cat_rev < cat_rev_base + exp_rev = Series([False, False, True]) + tm.assert_series_equal(res_rev, exp_rev) + + res = cat > cat_base + exp = Series([False, False, True]) + tm.assert_series_equal(res, exp) + + # Only categories with same categories can be compared + def f(): + cat > cat_rev + self.assertRaises(TypeError, f) + + # categorical cannot be compared to Series or numpy array, and also not the other way + # around + self.assertRaises(TypeError, lambda: cat > 
s) + self.assertRaises(TypeError, lambda: cat_rev > s) + self.assertRaises(TypeError, lambda: cat > a) + self.assertRaises(TypeError, lambda: cat_rev > a) + + self.assertRaises(TypeError, lambda: s < cat) + self.assertRaises(TypeError, lambda: s < cat_rev) + + self.assertRaises(TypeError, lambda: a < cat) + self.assertRaises(TypeError, lambda: a < cat_rev) + + # Categoricals can be compared to scalar values + res = cat_rev > base[0] + tm.assert_series_equal(res, exp) + + # And test NaN handling... + cat = pd.Series(pd.Categorical(["a","b","c", np.nan])) + exp = Series([True, True, True, False]) + res = (cat == cat) + tm.assert_series_equal(res, exp) def test_concat(self): - cat = pd.Categorical(["a","b"], levels=["a","b"]) + cat = pd.Categorical(["a","b"], categories=["a","b"]) vals = [1,2] df = pd.DataFrame({"cats":cat, "vals":vals}) - cat2 = pd.Categorical(["a","b","a","b"], levels=["a","b"]) + cat2 = pd.Categorical(["a","b","a","b"], categories=["a","b"]) vals2 = [1,2,1,2] exp = pd.DataFrame({"cats":cat2, "vals":vals2}, index=pd.Index([0, 1, 0, 1])) res = pd.concat([df,df]) tm.assert_frame_equal(exp, res) - # Concat should raise if the two categoricals do not have the same levels - cat3 = pd.Categorical(["a","b"], levels=["a","b","c"]) + # Concat should raise if the two categoricals do not have the same categories + cat3 = pd.Categorical(["a","b"], categories=["a","b","c"]) vals3 = [1,2] - df_wrong_levels = pd.DataFrame({"cats":cat3, "vals":vals3}) + df_wrong_categories = pd.DataFrame({"cats":cat3, "vals":vals3}) def f(): - pd.concat([df,df_wrong_levels]) + pd.concat([df,df_wrong_categories]) self.assertRaises(ValueError, f) # GH 7864 # make sure ordering is preserverd df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']}) df["grade"] = pd.Categorical(df["raw_grade"]) - df['grade'].cat.reorder_levels(['e', 'a', 'b']) + df['grade'].cat.set_categories(['e', 'a', 'b']) df1 = df[0:3] df2 = df[3:] - self.assert_numpy_array_equal(df['grade'].cat.levels, df1['grade'].cat.levels) - self.assert_numpy_array_equal(df['grade'].cat.levels, df2['grade'].cat.levels) + self.assert_numpy_array_equal(df['grade'].cat.categories, df1['grade'].cat.categories) + self.assert_numpy_array_equal(df['grade'].cat.categories, df2['grade'].cat.categories) dfx = pd.concat([df1, df2]) - dfx['grade'].cat.levels - self.assert_numpy_array_equal(df['grade'].cat.levels, dfx['grade'].cat.levels) + dfx['grade'].cat.categories + self.assert_numpy_array_equal(df['grade'].cat.categories, dfx['grade'].cat.categories) def test_append(self): - cat = pd.Categorical(["a","b"], levels=["a","b"]) + cat = pd.Categorical(["a","b"], categories=["a","b"]) vals = [1,2] df = pd.DataFrame({"cats":cat, "vals":vals}) - cat2 = pd.Categorical(["a","b","a","b"], levels=["a","b"]) + cat2 = pd.Categorical(["a","b","a","b"], categories=["a","b"]) vals2 = [1,2,1,2] exp = pd.DataFrame({"cats":cat2, "vals":vals2}, index=pd.Index([0, 1, 0, 1])) res = df.append(df) tm.assert_frame_equal(exp, res) - # Concat should raise if the two categoricals do not have the same levels - cat3 = pd.Categorical(["a","b"], levels=["a","b","c"]) + # Concat should raise if the two categoricals do not have the same categories + cat3 = pd.Categorical(["a","b"], categories=["a","b","c"]) vals3 = [1,2] - df_wrong_levels = pd.DataFrame({"cats":cat3, "vals":vals3}) + df_wrong_categories = pd.DataFrame({"cats":cat3, "vals":vals3}) def f(): - df.append(df_wrong_levels) + df.append(df_wrong_categories) self.assertRaises(ValueError, f) def 
test_na_actions(self): - cat = pd.Categorical([1,2,3,np.nan], levels=[1,2,3]) + cat = pd.Categorical([1,2,3,np.nan], categories=[1,2,3]) vals = ["a","b",np.nan,"d"] df = pd.DataFrame({"cats":cat, "vals":vals}) - cat2 = pd.Categorical([1,2,3,3], levels=[1,2,3]) + cat2 = pd.Categorical([1,2,3,3], categories=[1,2,3]) vals2 = ["a","b","b","d"] df_exp_fill = pd.DataFrame({"cats":cat2, "vals":vals2}) - cat3 = pd.Categorical([1,2,3], levels=[1,2,3]) + cat3 = pd.Categorical([1,2,3], categories=[1,2,3]) vals3 = ["a","b",np.nan] df_exp_drop_cats = pd.DataFrame({"cats":cat3, "vals":vals3}) - cat4 = pd.Categorical([1,2], levels=[1,2,3]) + cat4 = pd.Categorical([1,2], categories=[1,2,3]) vals4 = ["a","b"] df_exp_drop_all = pd.DataFrame({"cats":cat4, "vals":vals4}) @@ -1558,6 +2188,16 @@ def f(): res = df.dropna() tm.assert_frame_equal(res, df_exp_drop_all) + # make sure that fillna takes both missing values and NA categories into account + c = Categorical(["a","b",np.nan]) + c.set_categories(["a","b",np.nan], rename=True, inplace=True) + c[0] = np.nan + df = pd.DataFrame({"cats":c, "vals":[1,2,3]}) + df_exp = pd.DataFrame({"cats": Categorical(["a","b","a"]), "vals": [1,2,3]}) + res = df.fillna("a") + tm.assert_frame_equal(res, df_exp) + + def test_astype_to_other(self): s = self.cat['value_group'] @@ -1607,6 +2247,34 @@ def test_numeric_like_ops(self): # invalid ufunc self.assertRaises(TypeError, lambda : np.log(s)) + def test_cat_tab_completition(self): + # test the tab completion display + ok_for_cat = ['categories','codes','ordered','set_categories', + 'add_categories', 'remove_categories', 'rename_categories', + 'reorder_categories', 'remove_unused_categories'] + def get_dir(s): + results = [ r for r in s.cat.__dir__() if not r.startswith('_') ] + return list(sorted(set(results))) + + s = Series(list('aabbcde')).astype('category') + results = get_dir(s) + tm.assert_almost_equal(results,list(sorted(set(ok_for_cat)))) + + def test_pickle_v0_14_1(self): + cat = pd.Categorical(values=['a', 'b', 'c'], + levels=['a', 'b', 'c', 'd'], + name='foobar', ordered=False) + pickle_path = os.path.join(tm.get_data_path(), + 'categorical_0_14_1.pickle') + # This code was executed once on v0.14.1 to generate the pickle: + # + # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], + # name='foobar') + # with open(pickle_path, 'wb') as f: pickle.dump(cat, f) + # + self.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) + + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 5e91adbe1a2fa..0d13b6513b377 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -274,11 +274,6 @@ def test_repr_binary_type(): assert_equal(res, b) -def test_rands(): - r = com.rands(10) - assert(len(r) == 10) - - def test_adjoin(): data = [['a', 'b', 'c'], ['dd', 'ee', 'ff'], diff --git a/pandas/tests/test_config.py b/pandas/tests/test_config.py index e60c9d5bd0fdf..dc5e9a67bdb65 100644 --- a/pandas/tests/test_config.py +++ b/pandas/tests/test_config.py @@ -59,6 +59,7 @@ def test_register_option(self): # no python keywords self.assertRaises(ValueError, self.cf.register_option, 'for', 0) + self.assertRaises(ValueError, self.cf.register_option, 'a.for.b', 0) # must be valid identifier (ensure attribute access works) self.assertRaises(ValueError, self.cf.register_option, 'Oh my Goddess!', 0) diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py index 
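# --- Editor's note (not part of the diff): a minimal sketch of the rename the
# categorical tests above exercise. In the pandas 0.15 line this PR targets,
# Categorical's ``levels`` keyword/attribute becomes ``categories``, and
# ``cat.levels = ...`` / ``reorder_levels`` are replaced by ``set_categories``
# and friends. Values and variable names below are illustrative only.
import pandas as pd

cat = pd.Categorical(["a", "a", "b"], categories=["a", "b"])  # formerly levels=["a", "b"]
ser = pd.Series(cat)
print(ser.cat.categories)                                     # Index(['a', 'b'], dtype='object')
ser.cat.set_categories(["a", "b", "c"], inplace=True)         # formerly ser.cat.levels = [...]
# Per the comparison tests above: categoricals compare only against categoricals
# with the same categories or against a scalar; comparing with a plain Series or
# ndarray raises TypeError.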
c6a9192d7bb79..89d08d37e0a30 100644 --- a/pandas/tests/test_format.py +++ b/pandas/tests/test_format.py @@ -43,7 +43,7 @@ def has_info_repr(df): def has_non_verbose_info_repr(df): has_info = has_info_repr(df) r = repr(df) - nv = len(r.split('\n')) == 4 # 1. , 2. Index, 3. Columns, 4. dtype + nv = len(r.split('\n')) == 6 # 1. , 2. Index, 3. Columns, 4. dtype, 5. memory usage, 6. trailing newline return has_info and nv def has_horizontally_truncated_repr(df): @@ -280,6 +280,36 @@ def mkframe(n): com.pprint_thing(df._repr_fits_horizontal_()) self.assertTrue(has_expanded_repr(df)) + def test_auto_detect(self): + term_width, term_height = get_terminal_size() + fac = 1.05 # Arbitrary large factor to exceed term widht + cols = range(int(term_width * fac)) + index = range(10) + df = DataFrame(index=index, columns=cols) + with option_context('mode.sim_interactive', True): + with option_context('max_rows',None): + with option_context('max_columns',None): + # Wrap around with None + self.assertTrue(has_expanded_repr(df)) + with option_context('max_rows',0): + with option_context('max_columns',0): + # Truncate with auto detection. + self.assertTrue(has_horizontally_truncated_repr(df)) + + index = range(int(term_height * fac)) + df = DataFrame(index=index, columns=cols) + with option_context('max_rows',0): + with option_context('max_columns',None): + # Wrap around with None + self.assertTrue(has_expanded_repr(df)) + # Truncate vertically + self.assertTrue(has_vertically_truncated_repr(df)) + + with option_context('max_rows',None): + with option_context('max_columns',0): + self.assertTrue(has_horizontally_truncated_repr(df)) + + def test_to_string_repr_unicode(self): buf = StringIO() @@ -385,6 +415,13 @@ def test_to_string_with_col_space(self): c30 = len(df.to_string(col_space=30).split("\n")[1]) self.assertTrue(c10 < c20 < c30) + # GH 8230 + # col_space wasn't being applied with header=False + with_header = df.to_string(col_space=20) + with_header_row1 = with_header.splitlines()[1] + no_header = df.to_string(col_space=20, header=False) + self.assertEqual(len(with_header_row1), len(no_header)) + def test_to_string_truncate_indices(self): for index in [ tm.makeStringIndex, tm.makeUnicodeIndex, tm.makeIntIndex, tm.makeDateIndex, tm.makePeriodIndex ]: @@ -1164,9 +1201,8 @@ def test_pprint_thing(self): def test_wide_repr(self): with option_context('mode.sim_interactive', True, 'display.show_dimensions', True): - col = lambda l, k: [tm.rands(k) for _ in range(l)] max_cols = get_option('display.max_columns') - df = DataFrame([col(max_cols - 1, 25) for _ in range(10)]) + df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1))) set_option('display.expand_frame_repr', False) rep_str = repr(df) @@ -1190,9 +1226,8 @@ def test_wide_repr_wide_columns(self): def test_wide_repr_named(self): with option_context('mode.sim_interactive', True): - col = lambda l, k: [tm.rands(k) for _ in range(l)] max_cols = get_option('display.max_columns') - df = DataFrame([col(max_cols-1, 25) for _ in range(10)]) + df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1))) df.index.name = 'DataFrame Index' set_option('display.expand_frame_repr', False) @@ -1212,11 +1247,10 @@ def test_wide_repr_named(self): def test_wide_repr_multiindex(self): with option_context('mode.sim_interactive', True): - col = lambda l, k: [tm.rands(k) for _ in range(l)] - midx = pandas.MultiIndex.from_arrays([np.array(col(10, 5)), - np.array(col(10, 5))]) + midx = pandas.MultiIndex.from_arrays( + tm.rands_array(5, size=(2, 10))) max_cols = 
get_option('display.max_columns') - df = DataFrame([col(max_cols-1, 25) for _ in range(10)], + df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)), index=midx) df.index.names = ['Level 0', 'Level 1'] set_option('display.expand_frame_repr', False) @@ -1237,12 +1271,11 @@ def test_wide_repr_multiindex(self): def test_wide_repr_multiindex_cols(self): with option_context('mode.sim_interactive', True): max_cols = get_option('display.max_columns') - col = lambda l, k: [tm.rands(k) for _ in range(l)] - midx = pandas.MultiIndex.from_arrays([np.array(col(10, 5)), - np.array(col(10, 5))]) - mcols = pandas.MultiIndex.from_arrays([np.array(col(max_cols-1, 3)), - np.array(col(max_cols-1, 3))]) - df = DataFrame([col(max_cols-1, 25) for _ in range(10)], + midx = pandas.MultiIndex.from_arrays( + tm.rands_array(5, size=(2, 10))) + mcols = pandas.MultiIndex.from_arrays( + tm.rands_array(3, size=(2, max_cols - 1))) + df = DataFrame(tm.rands_array(25, (10, max_cols - 1)), index=midx, columns=mcols) df.index.names = ['Level 0', 'Level 1'] set_option('display.expand_frame_repr', False) @@ -1259,9 +1292,8 @@ def test_wide_repr_multiindex_cols(self): def test_wide_repr_unicode(self): with option_context('mode.sim_interactive', True): - col = lambda l, k: [tm.randu(k) for _ in range(l)] max_cols = get_option('display.max_columns') - df = DataFrame([col(max_cols-1, 25) for _ in range(10)]) + df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1))) set_option('display.expand_frame_repr', False) rep_str = repr(df) set_option('display.expand_frame_repr', True) @@ -1840,30 +1872,31 @@ def test_repr_html(self): self.reset_display_options() def test_repr_html_wide(self): - row = lambda l, k: [tm.rands(k) for _ in range(l)] max_cols = get_option('display.max_columns') - df = DataFrame([row(max_cols-1, 25) for _ in range(10)]) + df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1))) reg_repr = df._repr_html_() assert "..." not in reg_repr - wide_df = DataFrame([row(max_cols+1, 25) for _ in range(10)]) + wide_df = DataFrame(tm.rands_array(25, size=(10, max_cols + 1))) wide_repr = wide_df._repr_html_() assert "..." in wide_repr def test_repr_html_wide_multiindex_cols(self): - row = lambda l, k: [tm.rands(k) for _ in range(l)] max_cols = get_option('display.max_columns') - tuples = list(itertools.product(np.arange(max_cols//2), ['foo', 'bar'])) - mcols = pandas.MultiIndex.from_tuples(tuples, names=['first', 'second']) - df = DataFrame([row(len(mcols), 25) for _ in range(10)], columns=mcols) + mcols = pandas.MultiIndex.from_product([np.arange(max_cols//2), + ['foo', 'bar']], + names=['first', 'second']) + df = DataFrame(tm.rands_array(25, size=(10, len(mcols))), + columns=mcols) reg_repr = df._repr_html_() assert '...' not in reg_repr - - tuples = list(itertools.product(np.arange(1+(max_cols//2)), ['foo', 'bar'])) - mcols = pandas.MultiIndex.from_tuples(tuples, names=['first', 'second']) - df = DataFrame([row(len(mcols), 25) for _ in range(10)], columns=mcols) + mcols = pandas.MultiIndex.from_product((np.arange(1+(max_cols//2)), + ['foo', 'bar']), + names=['first', 'second']) + df = DataFrame(tm.rands_array(25, size=(10, len(mcols))), + columns=mcols) wide_repr = df._repr_html_() assert '...' 
in wide_repr @@ -2083,6 +2116,31 @@ def test_to_latex(self): """ self.assertEqual(withoutindex_result, withoutindex_expected) + def test_to_latex_multiindex(self): + df = DataFrame({('x', 'y'): ['a']}) + result = df.to_latex() + expected = r"""\begin{tabular}{ll} +\toprule +{} & x \\ +{} & y \\ +\midrule +0 & a \\ +\bottomrule +\end{tabular} +""" + self.assertEqual(result, expected) + + result = df.T.to_latex() + expected = r"""\begin{tabular}{lll} +\toprule + & & 0 \\ +\midrule +x & y & a \\ +\bottomrule +\end{tabular} +""" + self.assertEqual(result, expected) + def test_to_latex_escape(self): a = 'a' b = 'b' @@ -2428,33 +2486,33 @@ def test_timedelta64(self): o = Series([datetime(2012,1,1,microsecond=150)]*3) y = s-o result = y.to_string() - self.assertTrue('-0 days, 00:00:00.000150' in result) + self.assertTrue('-1 days +23:59:59.999850' in result) # rounding? o = Series([datetime(2012,1,1,1)]*3) y = s-o result = y.to_string() - self.assertTrue('-0 days, 01:00:00' in result) - self.assertTrue('1 days, 23:00:00' in result) + self.assertTrue('-1 days +23:00:00' in result) + self.assertTrue('1 days 23:00:00' in result) o = Series([datetime(2012,1,1,1,1)]*3) y = s-o result = y.to_string() - self.assertTrue('-0 days, 01:01:00' in result) - self.assertTrue('1 days, 22:59:00' in result) + self.assertTrue('-1 days +22:59:00' in result) + self.assertTrue('1 days 22:59:00' in result) o = Series([datetime(2012,1,1,1,1,microsecond=150)]*3) y = s-o result = y.to_string() - self.assertTrue('-0 days, 01:01:00.000150' in result) - self.assertTrue('1 days, 22:58:59.999850' in result) + self.assertTrue('-1 days +22:58:59.999850' in result) + self.assertTrue('0 days 22:58:59.999850' in result) # neg time td = timedelta(minutes=5,seconds=3) s2 = Series(date_range('2012-1-1', periods=3, freq='D')) + td y = s - s2 result = y.to_string() - self.assertTrue('-00:05:03' in result) + self.assertTrue('-1 days +23:54:57' in result) td = timedelta(microseconds=550) s2 = Series(date_range('2012-1-1', periods=3, freq='D')) + td @@ -2462,6 +2520,11 @@ def test_timedelta64(self): result = y.to_string() self.assertTrue('2012-01-01 23:59:59.999450' in result) + # no boxing of the actual elements + td = Series(pd.timedelta_range('1 days',periods=3)) + result = td.to_string() + self.assertEqual(result,u("0 1 days\n1 2 days\n2 3 days")) + def test_mixed_datetime64(self): df = DataFrame({'A': [1, 2], 'B': ['2012-01-01', '2012-01-02']}) @@ -2727,33 +2790,51 @@ def test_format(self): class TestRepr_timedelta64(tm.TestCase): - def test_legacy(self): + + def test_none(self): delta_1d = pd.to_timedelta(1, unit='D') delta_0d = pd.to_timedelta(0, unit='D') delta_1s = pd.to_timedelta(1, unit='s') delta_500ms = pd.to_timedelta(500, unit='ms') - self.assertEqual(tslib.repr_timedelta64(delta_1d), "1 days, 00:00:00") - self.assertEqual(tslib.repr_timedelta64(-delta_1d), "-1 days, 00:00:00") - self.assertEqual(tslib.repr_timedelta64(delta_0d), "00:00:00") - self.assertEqual(tslib.repr_timedelta64(delta_1s), "00:00:01") - self.assertEqual(tslib.repr_timedelta64(delta_500ms), "00:00:00.500000") - self.assertEqual(tslib.repr_timedelta64(delta_1d + delta_1s), "1 days, 00:00:01") - self.assertEqual(tslib.repr_timedelta64(delta_1d + delta_500ms), "1 days, 00:00:00.500000") + drepr = lambda x: x._repr_base() + self.assertEqual(drepr(delta_1d), "1 days") + self.assertEqual(drepr(-delta_1d), "-1 days") + self.assertEqual(drepr(delta_0d), "0 days") + self.assertEqual(drepr(delta_1s), "0 days 00:00:01") + self.assertEqual(drepr(delta_500ms), "0 days 
00:00:00.500000") + self.assertEqual(drepr(delta_1d + delta_1s), "1 days 00:00:01") + self.assertEqual(drepr(delta_1d + delta_500ms), "1 days 00:00:00.500000") - def test_short(self): + def test_even_day(self): delta_1d = pd.to_timedelta(1, unit='D') delta_0d = pd.to_timedelta(0, unit='D') delta_1s = pd.to_timedelta(1, unit='s') delta_500ms = pd.to_timedelta(500, unit='ms') - self.assertEqual(tslib.repr_timedelta64(delta_1d, format='short'), "1 days") - self.assertEqual(tslib.repr_timedelta64(-delta_1d, format='short'), "-1 days") - self.assertEqual(tslib.repr_timedelta64(delta_0d, format='short'), "00:00:00") - self.assertEqual(tslib.repr_timedelta64(delta_1s, format='short'), "00:00:01") - self.assertEqual(tslib.repr_timedelta64(delta_500ms, format='short'), "00:00:00.500000") - self.assertEqual(tslib.repr_timedelta64(delta_1d + delta_1s, format='short'), "1 days, 00:00:01") - self.assertEqual(tslib.repr_timedelta64(delta_1d + delta_500ms, format='short'), "1 days, 00:00:00.500000") + drepr = lambda x: x._repr_base(format='even_day') + self.assertEqual(drepr(delta_1d), "1 days") + self.assertEqual(drepr(-delta_1d), "-1 days") + self.assertEqual(drepr(delta_0d), "0 days") + self.assertEqual(drepr(delta_1s), "0 days 00:00:01") + self.assertEqual(drepr(delta_500ms), "0 days 00:00:00.500000") + self.assertEqual(drepr(delta_1d + delta_1s), "1 days 00:00:01") + self.assertEqual(drepr(delta_1d + delta_500ms), "1 days 00:00:00.500000") + + def test_sub_day(self): + delta_1d = pd.to_timedelta(1, unit='D') + delta_0d = pd.to_timedelta(0, unit='D') + delta_1s = pd.to_timedelta(1, unit='s') + delta_500ms = pd.to_timedelta(500, unit='ms') + + drepr = lambda x: x._repr_base(format='sub_day') + self.assertEqual(drepr(delta_1d), "1 days") + self.assertEqual(drepr(-delta_1d), "-1 days") + self.assertEqual(drepr(delta_0d), "00:00:00") + self.assertEqual(drepr(delta_1s), "00:00:01") + self.assertEqual(drepr(delta_500ms), "00:00:00.500000") + self.assertEqual(drepr(delta_1d + delta_1s), "1 days 00:00:01") + self.assertEqual(drepr(delta_1d + delta_500ms), "1 days 00:00:00.500000") def test_long(self): delta_1d = pd.to_timedelta(1, unit='D') @@ -2761,65 +2842,69 @@ def test_long(self): delta_1s = pd.to_timedelta(1, unit='s') delta_500ms = pd.to_timedelta(500, unit='ms') - self.assertEqual(tslib.repr_timedelta64(delta_1d, format='long'), "1 days, 00:00:00") - self.assertEqual(tslib.repr_timedelta64(-delta_1d, format='long'), "-1 days, 00:00:00") - self.assertEqual(tslib.repr_timedelta64(delta_0d, format='long'), "0 days, 00:00:00") - self.assertEqual(tslib.repr_timedelta64(delta_1s, format='long'), "0 days, 00:00:01") - self.assertEqual(tslib.repr_timedelta64(delta_500ms, format='long'), "0 days, 00:00:00.500000") - self.assertEqual(tslib.repr_timedelta64(delta_1d + delta_1s, format='long'), "1 days, 00:00:01") - self.assertEqual(tslib.repr_timedelta64(delta_1d + delta_500ms, format='long'), "1 days, 00:00:00.500000") + drepr = lambda x: x._repr_base(format='long') + self.assertEqual(drepr(delta_1d), "1 days 00:00:00") + self.assertEqual(drepr(-delta_1d), "-1 days +00:00:00") + self.assertEqual(drepr(delta_0d), "0 days 00:00:00") + self.assertEqual(drepr(delta_1s), "0 days 00:00:01") + self.assertEqual(drepr(delta_500ms), "0 days 00:00:00.500000") + self.assertEqual(drepr(delta_1d + delta_1s), "1 days 00:00:01") + self.assertEqual(drepr(delta_1d + delta_500ms), "1 days 00:00:00.500000") + def test_all(self): + delta_1d = pd.to_timedelta(1, unit='D') + delta_0d = pd.to_timedelta(0, unit='D') + delta_1ns = 
pd.to_timedelta(1, unit='ns') -class TestTimedelta64Formatter(tm.TestCase): - def test_mixed(self): - x = pd.to_timedelta(list(range(5)) + [pd.NaT], unit='D') - y = pd.to_timedelta(list(range(5)) + [pd.NaT], unit='s') - result = fmt.Timedelta64Formatter(x + y).get_result() - self.assertEqual(result[0].strip(), "0 days, 00:00:00") - self.assertEqual(result[1].strip(), "1 days, 00:00:01") + drepr = lambda x: x._repr_base(format='all') + self.assertEqual(drepr(delta_1d), "1 days 00:00:00.000000000") + self.assertEqual(drepr(delta_0d), "0 days 00:00:00.000000000") + self.assertEqual(drepr(delta_1ns), "0 days 00:00:00.000000001") - def test_mixed_neg(self): - x = pd.to_timedelta(list(range(5)) + [pd.NaT], unit='D') - y = pd.to_timedelta(list(range(5)) + [pd.NaT], unit='s') - result = fmt.Timedelta64Formatter(-(x + y)).get_result() - self.assertEqual(result[0].strip(), "0 days, 00:00:00") - self.assertEqual(result[1].strip(), "-1 days, 00:00:01") +class TestTimedelta64Formatter(tm.TestCase): def test_days(self): x = pd.to_timedelta(list(range(5)) + [pd.NaT], unit='D') - result = fmt.Timedelta64Formatter(x).get_result() + result = fmt.Timedelta64Formatter(x,box=True).get_result() + self.assertEqual(result[0].strip(), "'0 days'") + self.assertEqual(result[1].strip(), "'1 days'") + + result = fmt.Timedelta64Formatter(x[1:2],box=True).get_result() + self.assertEqual(result[0].strip(), "'1 days'") + + result = fmt.Timedelta64Formatter(x,box=False).get_result() self.assertEqual(result[0].strip(), "0 days") self.assertEqual(result[1].strip(), "1 days") - result = fmt.Timedelta64Formatter(x[1:2]).get_result() + result = fmt.Timedelta64Formatter(x[1:2],box=False).get_result() self.assertEqual(result[0].strip(), "1 days") def test_days_neg(self): x = pd.to_timedelta(list(range(5)) + [pd.NaT], unit='D') - result = fmt.Timedelta64Formatter(-x).get_result() - self.assertEqual(result[0].strip(), "0 days") - self.assertEqual(result[1].strip(), "-1 days") + result = fmt.Timedelta64Formatter(-x,box=True).get_result() + self.assertEqual(result[0].strip(), "'0 days'") + self.assertEqual(result[1].strip(), "'-1 days'") def test_subdays(self): y = pd.to_timedelta(list(range(5)) + [pd.NaT], unit='s') - result = fmt.Timedelta64Formatter(y).get_result() - self.assertEqual(result[0].strip(), "00:00:00") - self.assertEqual(result[1].strip(), "00:00:01") + result = fmt.Timedelta64Formatter(y,box=True).get_result() + self.assertEqual(result[0].strip(), "'00:00:00'") + self.assertEqual(result[1].strip(), "'00:00:01'") def test_subdays_neg(self): y = pd.to_timedelta(list(range(5)) + [pd.NaT], unit='s') - result = fmt.Timedelta64Formatter(-y).get_result() - self.assertEqual(result[0].strip(), "00:00:00") - self.assertEqual(result[1].strip(), "-00:00:01") + result = fmt.Timedelta64Formatter(-y,box=True).get_result() + self.assertEqual(result[0].strip(), "'00:00:00'") + self.assertEqual(result[1].strip(), "'-1 days +23:59:59'") def test_zero(self): x = pd.to_timedelta(list(range(1)) + [pd.NaT], unit='D') - result = fmt.Timedelta64Formatter(x).get_result() - self.assertEqual(result[0].strip(), "0 days") + result = fmt.Timedelta64Formatter(x,box=True).get_result() + self.assertEqual(result[0].strip(), "'0 days'") x = pd.to_timedelta(list(range(1)), unit='D') - result = fmt.Timedelta64Formatter(x).get_result() - self.assertEqual(result[0].strip(), "0 days") + result = fmt.Timedelta64Formatter(x,box=True).get_result() + self.assertEqual(result[0].strip(), "'0 days'") class TestDatetime64Formatter(tm.TestCase): diff --git 
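# --- Editor's note (not part of the diff): the test_format.py changes above
# reflect the new timedelta string formatting in this PR: "1 days, 00:00:01"
# becomes "1 days 00:00:01", and negative deltas render in the
# "-1 days +23:59:59" style. A small illustrative snippet:
import pandas as pd

td = pd.to_timedelta(1, unit='D') + pd.to_timedelta(1, unit='s')
print(str(td))    # 1 days 00:00:01
print(str(-td))   # -1 days +23:59:59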
a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index cf845a18092af..3f4d825a4b82e 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -32,7 +32,8 @@ import pandas.core.format as fmt import pandas.core.datetools as datetools from pandas import (DataFrame, Index, Series, notnull, isnull, - MultiIndex, DatetimeIndex, Timestamp, date_range, read_csv, + MultiIndex, DatetimeIndex, Timestamp, date_range, + read_csv, timedelta_range, Timedelta, option_context) import pandas as pd from pandas.parser import CParserError @@ -223,6 +224,134 @@ def test_setitem_list_of_tuples(self): expected = Series(tuples, index=self.frame.index) assert_series_equal(result, expected) + def test_setitem_mulit_index(self): + # GH7655, test that assigning to a sub-frame of a frame + # with multi-index columns aligns both rows and columns + it = ['jim', 'joe', 'jolie'], ['first', 'last'], \ + ['left', 'center', 'right'] + + cols = MultiIndex.from_product(it) + index = pd.date_range('20141006',periods=20) + vals = np.random.randint(1, 1000, (len(index), len(cols))) + df = pd.DataFrame(vals, columns=cols, index=index) + + i, j = df.index.values.copy(), it[-1][:] + + np.random.shuffle(i) + df['jim'] = df['jolie'].loc[i, ::-1] + assert_frame_equal(df['jim'], df['jolie']) + + np.random.shuffle(j) + df[('joe', 'first')] = df[('jolie', 'last')].loc[i, j] + assert_frame_equal(df[('joe', 'first')], df[('jolie', 'last')]) + + np.random.shuffle(j) + df[('joe', 'last')] = df[('jolie', 'first')].loc[i, j] + assert_frame_equal(df[('joe', 'last')], df[('jolie', 'first')]) + + def test_inplace_ops_alignment(self): + + # inplace ops / ops alignment + # GH 8511 + + columns = list('abcdefg') + X_orig = DataFrame(np.arange(10*len(columns)).reshape(-1,len(columns)), columns=columns, index=range(10)) + Z = 100*X_orig.iloc[:,1:-1].copy() + block1 = list('bedcf') + subs = list('bcdef') + + # add + X = X_orig.copy() + result1 = (X[block1] + Z).reindex(columns=subs) + + X[block1] += Z + result2 = X.reindex(columns=subs) + + X = X_orig.copy() + result3 = (X[block1] + Z[block1]).reindex(columns=subs) + + X[block1] += Z[block1] + result4 = X.reindex(columns=subs) + + assert_frame_equal(result1, result2) + assert_frame_equal(result1, result3) + assert_frame_equal(result1, result4) + + # sub + X = X_orig.copy() + result1 = (X[block1] - Z).reindex(columns=subs) + + X[block1] -= Z + result2 = X.reindex(columns=subs) + + X = X_orig.copy() + result3 = (X[block1] - Z[block1]).reindex(columns=subs) + + X[block1] -= Z[block1] + result4 = X.reindex(columns=subs) + + assert_frame_equal(result1, result2) + assert_frame_equal(result1, result3) + assert_frame_equal(result1, result4) + + def test_inplace_ops_identity(self): + + # GH 5104 + # make sure that we are actually changing the object + s_orig = Series([1, 2, 3]) + df_orig = DataFrame(np.random.randint(0,5,size=10).reshape(-1,5)) + + # no dtype change + s = s_orig.copy() + s2 = s + s += 1 + assert_series_equal(s,s2) + assert_series_equal(s_orig+1,s) + self.assertIs(s,s2) + self.assertIs(s._data,s2._data) + + df = df_orig.copy() + df2 = df + df += 1 + assert_frame_equal(df,df2) + assert_frame_equal(df_orig+1,df) + self.assertIs(df,df2) + self.assertIs(df._data,df2._data) + + # dtype change + s = s_orig.copy() + s2 = s + s += 1.5 + assert_series_equal(s,s2) + assert_series_equal(s_orig+1.5,s) + + df = df_orig.copy() + df2 = df + df += 1.5 + assert_frame_equal(df,df2) + assert_frame_equal(df_orig+1.5,df) + self.assertIs(df,df2) + self.assertIs(df._data,df2._data) + + # mixed 
dtype + arr = np.random.randint(0,10,size=5) + df_orig = DataFrame({'A' : arr.copy(), 'B' : 'foo'}) + df = df_orig.copy() + df2 = df + df['A'] += 1 + expected = DataFrame({'A' : arr.copy()+1, 'B' : 'foo'}) + assert_frame_equal(df,expected) + assert_frame_equal(df2,expected) + self.assertIs(df._data,df2._data) + + df = df_orig.copy() + df2 = df + df['A'] += 1.5 + expected = DataFrame({'A' : arr.copy()+1.5, 'B' : 'foo'}) + assert_frame_equal(df,expected) + assert_frame_equal(df2,expected) + self.assertIs(df._data,df2._data) + def test_getitem_boolean(self): # boolean indexing d = self.tsframe.index[10] @@ -533,6 +662,15 @@ def test_setitem_cast(self): self.frame['something'] = 2.5 self.assertEqual(self.frame['something'].dtype, np.float64) + # GH 7704 + # dtype conversion on setting + df = DataFrame(np.random.rand(30, 3), columns=tuple('ABC')) + df['event'] = np.nan + df.loc[10,'event'] = 'foo' + result = df.get_dtype_counts().order() + expected = Series({'float64' : 3, 'object' : 1 }).order() + assert_series_equal(result, expected) + def test_setitem_boolean_column(self): expected = self.frame.copy() mask = self.frame['A'] > 0 @@ -1395,6 +1533,7 @@ def test_setitem_frame(self): # key is unaligned with values f = self.mixed_frame.copy() piece = f.ix[:2, ['A']] + piece.index = f.index[-2:] key = (slice(-2, None), ['A', 'B']) f.ix[key] = piece piece['B'] = np.nan @@ -1790,6 +1929,20 @@ def test_getitem_ix_float_duplicates(self): expect = df.iloc[[1, -1], 0] tm.assert_series_equal(df.loc[0.2, 'a'], expect) + def test_setitem_with_sparse_value(self): + # GH8131 + df = pd.DataFrame({'c_1':['a', 'b', 'c'], 'n_1': [1., 2., 3.]}) + sp_series = pd.Series([0, 0, 1]).to_sparse(fill_value=0) + df['new_column'] = sp_series + tm.assert_series_equal(df['new_column'], sp_series) + + def test_setitem_with_unaligned_sparse_value(self): + df = pd.DataFrame({'c_1':['a', 'b', 'c'], 'n_1': [1., 2., 3.]}) + sp_series = (pd.Series([0, 0, 1], index=[2, 1, 0]) + .to_sparse(fill_value=0)) + df['new_column'] = sp_series + tm.assert_series_equal(df['new_column'], pd.Series([1, 0, 0])) + _seriesd = tm.getSeriesData() _tsd = tm.getTimeSeriesData() @@ -3499,6 +3652,23 @@ def check(result, expected=None): result = z.ix[['a', 'c', 'a']] check(result,expected) + + def test_column_dups_indexing2(self): + + # GH 8363 + # datetime ops with a non-unique index + df = DataFrame({'A' : np.arange(5,dtype='int64'), + 'B' : np.arange(1,6,dtype='int64')}, + index=[2,2,3,3,4]) + result = df.B-df.A + expected = Series(1,index=[2,2,3,3,4]) + assert_series_equal(result,expected) + + df = DataFrame({'A' : date_range('20130101',periods=5), 'B' : date_range('20130101 09:00:00', periods=5)},index=[2,2,3,3,4]) + result = df.B-df.A + expected = Series(Timedelta('9 hours'),index=[2,2,3,3,4]) + assert_series_equal(result,expected) + def test_insert_benchmark(self): # from the vb_suite/frame_methods/frame_insert_columns N = 10 @@ -3613,6 +3783,18 @@ def test_constructor_with_datetimes(self): self.assertEqual(df.iat[0,0],dt) assert_series_equal(df.dtypes,Series({'End Date' : np.dtype('object') })) + # tz-aware (UTC and other tz's) + # GH 8411 + dr = date_range('20130101',periods=3) + df = DataFrame({ 'value' : dr}) + self.assertTrue(df.iat[0,0].tz is None) + dr = date_range('20130101',periods=3,tz='UTC') + df = DataFrame({ 'value' : dr}) + self.assertTrue(str(df.iat[0,0].tz) == 'UTC') + dr = date_range('20130101',periods=3,tz='US/Eastern') + df = DataFrame({ 'value' : dr}) + self.assertTrue(str(df.iat[0,0].tz) == 'US/Eastern') + # GH 7822 # preserver 
an index with a tz on dict construction i = date_range('1/1/2011', periods=5, freq='10s', tz = 'US/Eastern') @@ -3958,6 +4140,13 @@ def test_to_dict(self): for k2, v2 in compat.iteritems(v): self.assertEqual(v2, recons_data[k][k2]) + recons_data = DataFrame(test_data).to_dict("sp") + + expected_split = {'columns': ['A', 'B'], 'index': ['1', '2', '3'], + 'data': [[1.0, '1'], [2.0, '2'], [nan, '3']]} + + tm.assert_almost_equal(recons_data, expected_split) + recons_data = DataFrame(test_data).to_dict("r") expected_records = [{'A': 1.0, 'B': '1'}, @@ -4017,7 +4206,7 @@ def test_from_records_to_records(self): tm.assert_frame_equal(DataFrame.from_records(arr2), DataFrame(arr2)) # wrong length - msg = r'Shape of passed values is \(3,\), indices imply \(3, 1\)' + msg = r'Shape of passed values is \(3, 2\), indices imply \(3, 1\)' with assertRaisesRegexp(ValueError, msg): DataFrame.from_records(arr, index=index[:-1]) @@ -4545,7 +4734,7 @@ def test_bytestring_with_unicode(self): def test_very_wide_info_repr(self): df = DataFrame(np.random.randn(10, 20), - columns=[tm.rands(10) for _ in range(20)]) + columns=tm.rands_array(10, 20)) repr(df) def test_repr_column_name_unicode_truncation_bug(self): @@ -4712,6 +4901,19 @@ def test_operators(self): df = DataFrame({'a': ['a', None, 'b']}) assert_frame_equal(df + df, DataFrame({'a': ['aa', np.nan, 'bb']})) + def test_ops_np_scalar(self): + vals, xs = np.random.rand(5, 3), [nan, 7, -23, 2.718, -3.14, np.inf] + f = lambda x: DataFrame(x, index=list('ABCDE'), + columns=['jim', 'joe', 'jolie']) + + df = f(vals) + + for x in xs: + assert_frame_equal(df / np.array(x), f(vals / x)) + assert_frame_equal(np.array(x) * df, f(vals * x)) + assert_frame_equal(df + np.array(x), f(vals + x)) + assert_frame_equal(np.array(x) - df, f(x - vals)) + def test_operators_boolean(self): # GH 5808 @@ -4880,7 +5082,6 @@ def test_div(self): self.assertFalse(np.array_equal(res.fillna(0), res2.fillna(0))) def test_logical_operators(self): - import operator def _check_bin_op(op): result = op(df1, df2) @@ -6415,6 +6616,14 @@ def test_to_csv_from_csv_categorical(self): df2.to_csv(exp) self.assertEqual(res.getvalue(), exp.getvalue()) + def test_to_csv_path_is_none(self): + # GH 8215 + # Make sure we return string for consistency with + # Series.to_csv() + csv_str = self.frame.to_csv(path=None) + self.assertIsInstance(csv_str, str) + recons = pd.read_csv(StringIO(csv_str), index_col=0) + assert_frame_equal(self.frame, recons) def test_info(self): io = StringIO() @@ -6472,38 +6681,87 @@ def test_info_shows_column_dtypes(self): def test_info_max_cols(self): df = DataFrame(np.random.randn(10, 5)) - for len_, verbose in [(4, None), (4, False), (9, True)]: + for len_, verbose in [(5, None), (5, False), (10, True)]: # For verbose always ^ setting ^ summarize ^ full output with option_context('max_info_columns', 4): buf = StringIO() df.info(buf=buf, verbose=verbose) res = buf.getvalue() - self.assertEqual(len(res.split('\n')), len_) + self.assertEqual(len(res.strip().split('\n')), len_) - for len_, verbose in [(9, None), (4, False), (9, True)]: + for len_, verbose in [(10, None), (5, False), (10, True)]: # max_cols no exceeded with option_context('max_info_columns', 5): buf = StringIO() df.info(buf=buf, verbose=verbose) res = buf.getvalue() - self.assertEqual(len(res.split('\n')), len_) + self.assertEqual(len(res.strip().split('\n')), len_) - for len_, max_cols in [(9, 5), (4, 4)]: + for len_, max_cols in [(10, 5), (5, 4)]: # setting truncates with option_context('max_info_columns', 4): buf = 
StringIO() df.info(buf=buf, max_cols=max_cols) res = buf.getvalue() - self.assertEqual(len(res.split('\n')), len_) + self.assertEqual(len(res.strip().split('\n')), len_) # setting wouldn't truncate with option_context('max_info_columns', 5): buf = StringIO() df.info(buf=buf, max_cols=max_cols) res = buf.getvalue() - self.assertEqual(len(res.split('\n')), len_) + self.assertEqual(len(res.strip().split('\n')), len_) + def test_info_memory_usage(self): + # Ensure memory usage is displayed, when asserted, on the last line + dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]', + 'complex128', 'object', 'bool'] + data = {} + n = 10 + for i, dtype in enumerate(dtypes): + data[i] = np.random.randint(2, size=n).astype(dtype) + df = DataFrame(data) + buf = StringIO() + # display memory usage case + df.info(buf=buf, memory_usage=True) + res = buf.getvalue().splitlines() + self.assertTrue("memory usage: " in res[-1]) + # do not display memory usage cas + df.info(buf=buf, memory_usage=False) + res = buf.getvalue().splitlines() + self.assertTrue("memory usage: " not in res[-1]) + + df.info(buf=buf, memory_usage=True) + res = buf.getvalue().splitlines() + # memory usage is a lower bound, so print it as XYZ+ MB + self.assertTrue(re.match(r"memory usage: [^+]+\+", res[-1])) + + df.iloc[:, :5].info(buf=buf, memory_usage=True) + res = buf.getvalue().splitlines() + # excluded column with object dtype, so estimate is accurate + self.assertFalse(re.match(r"memory usage: [^+]+\+", res[-1])) + + df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo']) + df_with_object_index.info(buf=buf, memory_usage=True) + res = buf.getvalue().splitlines() + self.assertTrue(re.match(r"memory usage: [^+]+\+", res[-1])) + + # Test a DataFrame with duplicate columns + dtypes = ['int64', 'int64', 'int64', 'float64'] + data = {} + n = 100 + for i, dtype in enumerate(dtypes): + data[i] = np.random.randint(2, size=n).astype(dtype) + df = DataFrame(data) + df.columns = dtypes + # Ensure df size is as expected + df_size = df.memory_usage().sum() + exp_size = len(dtypes) * n * 8 # cols * rows * bytes + self.assertEqual(df_size, exp_size) + # Ensure number of cols in memory_usage is the same as df + size_df = np.size(df.columns.values) # index=False; default + self.assertEqual(size_df, np.size(df.memory_usage())) def test_dtypes(self): self.mixed_frame['bool'] = self.mixed_frame['A'] > 0 @@ -7138,6 +7396,8 @@ def test_dropna_corner(self): # bad input self.assertRaises(ValueError, self.frame.dropna, how='foo') self.assertRaises(TypeError, self.frame.dropna, how=None) + # non-existent column - 8303 + self.assertRaises(KeyError, self.frame.dropna, subset=['A','X']) def test_dropna_multiple_axes(self): df = DataFrame([[1, np.nan, 2, 3], @@ -7568,6 +7828,29 @@ def test_fillna_dict_series(self): with assertRaisesRegexp(NotImplementedError, 'column by column'): df.fillna(df.max(1), axis=1) + def test_fillna_dataframe(self): + # GH 8377 + df = DataFrame({'a': [nan, 1, 2, nan, nan], + 'b': [1, 2, 3, nan, nan], + 'c': [nan, 1, 2, 3, 4]}, + index = list('VWXYZ')) + + # df2 may have different index and columns + df2 = DataFrame({'a': [nan, 10, 20, 30, 40], + 'b': [50, 60, 70, 80, 90], + 'foo': ['bar']*5}, + index = list('VWXuZ')) + + result = df.fillna(df2) + + # only those columns and indices which are shared get filled + expected = DataFrame({'a': [nan, 1, 2, nan, 40], + 'b': [1, 2, 3, nan, 90], + 'c': [nan, 1, 2, 3, 4]}, + index = list('VWXYZ')) + + assert_frame_equal(result, expected) + def test_fillna_columns(self): df = 
DataFrame(np.random.randn(10, 10)) df.values[:, ::2] = np.nan @@ -7581,6 +7864,7 @@ def test_fillna_columns(self): expected = df.astype(float).fillna(method='ffill', axis=1) assert_frame_equal(result, expected) + def test_fillna_invalid_method(self): with assertRaisesRegexp(ValueError, 'ffil'): self.frame.fillna(method='ffil') @@ -7590,6 +7874,8 @@ def test_fillna_invalid_value(self): self.assertRaises(TypeError, self.frame.fillna, [1, 2]) # tuple self.assertRaises(TypeError, self.frame.fillna, (1, 2)) + # frame with series + self.assertRaises(ValueError, self.frame.iloc[:,0].fillna, self.frame) def test_replace_inplace(self): self.tsframe['A'][:5] = nan @@ -8953,8 +9239,8 @@ def test_align(self): self.assertTrue(bf.columns.equals(other.columns)) # test fill value join_idx = self.frame.index.join(other.index) - diff_a = self.frame.index.diff(join_idx) - diff_b = other.index.diff(join_idx) + diff_a = self.frame.index.difference(join_idx) + diff_b = other.index.difference(join_idx) diff_a_vals = af.reindex(diff_a).values diff_b_vals = bf.reindex(diff_b).values self.assertTrue((diff_a_vals == -1).all()) @@ -8972,8 +9258,8 @@ def test_align(self): # test fill value join_idx = self.frame.index.join(other.index) - diff_a = self.frame.index.diff(join_idx) - diff_b = other.index.diff(join_idx) + diff_a = self.frame.index.difference(join_idx) + diff_b = other.index.difference(join_idx) diff_a_vals = af.reindex(diff_a).values diff_b_vals = bf.reindex(diff_b).values self.assertTrue((diff_a_vals == -1).all()) @@ -9494,6 +9780,18 @@ def test_diff(self): assert_series_equal(the_diff['A'], tf['A'] - tf['A'].shift(1)) + def test_diff_timedelta(self): + # GH 4533 + df = DataFrame(dict(time=[Timestamp('20130101 9:01'), + Timestamp('20130101 9:02')], + value=[1.0,2.0])) + + res = df.diff() + exp = DataFrame([[pd.NaT, np.nan], + [Timedelta('00:01:00'), 1]], + columns=['time', 'value']) + assert_frame_equal(res, exp) + def test_diff_mixed_dtype(self): df = DataFrame(np.random.randn(5, 3)) df['A'] = np.array([1, 2, 3, 4, 5], dtype=object) @@ -9610,6 +9908,13 @@ def test_shift_bool(self): columns=['high', 'low']) assert_frame_equal(rs, xp) + def test_shift_empty(self): + # Regression test for #8019 + df = DataFrame({'foo': []}) + rs = df.shift(-1) + + assert_frame_equal(df, rs) + def test_tshift(self): # PeriodIndex ps = tm.makePeriodFrame() @@ -9996,6 +10301,13 @@ def test_applymap(self): result = df.applymap(str) assert_frame_equal(result,expected) + # datetime/timedelta + df['datetime'] = Timestamp('20130101') + df['timedelta'] = Timedelta('1 min') + result = df.applymap(str) + for f in ['datetime','timedelta']: + self.assertEquals(result.loc[0,f],str(df.loc[0,f])) + def test_filter(self): # items filtered = self.frame.filter(['A', 'B', 'E']) @@ -11857,6 +12169,17 @@ def test_unstack_non_unique_index_names(self): with tm.assertRaises(ValueError): df.T.stack('c1') + def test_stack_datetime_column_multiIndex(self): + # GH 8039 + t = datetime(2014, 1, 1) + df = DataFrame([1, 2, 3, 4], columns=MultiIndex.from_tuples([(t, 'A', 'B')])) + result = df.stack() + + eidx = MultiIndex.from_product([(0, 1, 2, 3), ('B',)]) + ecols = MultiIndex.from_tuples([(t, 'A')]) + expected = DataFrame([1, 2, 3, 4], index=eidx, columns=ecols) + assert_frame_equal(result, expected) + def test_repr_with_mi_nat(self): df = DataFrame({'X': [1, 2]}, index=[[pd.NaT, pd.Timestamp('20130101')], ['a', 'b']]) @@ -12136,6 +12459,42 @@ def test_construction_with_mixed(self): 'timedelta64[ns]' : 1}).order() assert_series_equal(result,expected) + 
def test_construction_with_conversions(self): + + # convert from a numpy array of non-ns timedelta64 + arr = np.array([1,2,3],dtype='timedelta64[s]') + s = Series(arr) + expected = Series(timedelta_range('00:00:01',periods=3,freq='s')) + assert_series_equal(s,expected) + + df = DataFrame(index=range(3)) + df['A'] = arr + expected = DataFrame({'A' : timedelta_range('00:00:01',periods=3,freq='s')}, + index=range(3)) + assert_frame_equal(df,expected) + + # convert from a numpy array of non-ns datetime64 + #### note that creating a numpy datetime64 is in LOCAL time!!!! + #### seems to work for M8[D], but not for M8[s] + + s = Series(np.array(['2013-01-01','2013-01-02','2013-01-03'],dtype='datetime64[D]')) + assert_series_equal(s,Series(date_range('20130101',periods=3,freq='D'))) + #s = Series(np.array(['2013-01-01 00:00:01','2013-01-01 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]')) + #assert_series_equal(s,date_range('20130101 00:00:01',period=3,freq='s')) + + expected = DataFrame({ + 'dt1' : Timestamp('20130101'), + 'dt2' : date_range('20130101',periods=3), + #'dt3' : date_range('20130101 00:00:01',periods=3,freq='s'), + },index=range(3)) + + + df = DataFrame(index=range(3)) + df['dt1'] = np.datetime64('2013-01-01') + df['dt2'] = np.array(['2013-01-01','2013-01-02','2013-01-03'],dtype='datetime64[D]') + #df['dt3'] = np.array(['2013-01-01 00:00:01','2013-01-01 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]') + assert_frame_equal(df, expected) + def test_constructor_frame_copy(self): cop = DataFrame(self.frame, copy=True) cop['A'] = 5 @@ -12635,8 +12994,8 @@ def test_consolidate_datetime64(self): df.starting = ser_starting.index df.ending = ser_ending.index - assert_array_equal(df.starting.values, ser_starting.index.values) - assert_array_equal(df.ending.values, ser_ending.index.values) + tm.assert_index_equal(pd.DatetimeIndex(df.starting), ser_starting.index) + tm.assert_index_equal(pd.DatetimeIndex(df.ending), ser_ending.index) def _check_bool_op(self, name, alternative, frame=None, has_skipna=True, has_bool_only=False): diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 001d6f489e934..1fe1b552649ed 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -123,6 +123,23 @@ def test_get_numeric_data(self): # _get_numeric_data is includes _get_bool_data, so can't test for non-inclusion + def test_get_default(self): + + # GH 7725 + d0 = "a", "b", "c", "d" + d1 = np.arange(4, dtype='int64') + others = "e", 10 + + for data, index in ((d0, d1), (d1, d0)): + s = Series(data, index=index) + for i,d in zip(index, data): + self.assertEqual(s.get(i), d) + self.assertEqual(s.get(i, d), d) + self.assertEqual(s.get(i, "z"), d) + for other in others: + self.assertEqual(s.get(other, "z"), "z") + self.assertEqual(s.get(other, other), other) + def test_nonzero(self): # GH 4633 @@ -484,7 +501,7 @@ def test_interp_regression(self): ser = Series(np.sort(np.random.uniform(size=100))) # interpolate at new_index - new_index = ser.index + Index([49.25, 49.5, 49.75, 50.25, 50.5, 50.75]) + new_index = ser.index.union(Index([49.25, 49.5, 49.75, 50.25, 50.5, 50.75])) interp_s = ser.reindex(new_index).interpolate(method='pchip') # does not blow up, GH5977 interp_s[49:51] @@ -639,6 +656,13 @@ def test_interp_datetime64(self): expected = Series([1., 1., 3.], index=date_range('1/1/2000', periods=3)) assert_series_equal(result, expected) + def test_interp_limit_no_nans(self): + # GH 7173 + s = pd.Series([1., 2., 3.]) + result = s.interpolate(limit=1) + 
expected = s + assert_series_equal(result, expected) + def test_describe(self): _ = self.series.describe() _ = self.ts.describe() @@ -981,18 +1005,17 @@ def test_describe_objects(self): df = DataFrame({"C1": pd.date_range('2010-01-01', periods=4, freq='D')}) df.loc[4] = pd.Timestamp('2010-01-04') result = df.describe() - expected = DataFrame({"C1": [5, 4, pd.Timestamp('2010-01-01'), - pd.Timestamp('2010-01-04'), - pd.Timestamp('2010-01-04'), 2]}, - index=['count', 'unique', 'first', 'last', 'top', - 'freq']) + expected = DataFrame({"C1": [5, 4, pd.Timestamp('2010-01-04'), 2, + pd.Timestamp('2010-01-01'), + pd.Timestamp('2010-01-04')]}, + index=['count', 'unique', 'top', 'freq', + 'first', 'last']) assert_frame_equal(result, expected) # mix time and str df['C2'] = ['a', 'a', 'b', 'c', 'a'] result = df.describe() - # when mix of dateimte / obj the index gets reordered. - expected['C2'] = [5, 3, np.nan, np.nan, 'a', 3] + expected['C2'] = [5, 3, 'a', 3, np.nan, np.nan] assert_frame_equal(result, expected) # just str @@ -1012,6 +1035,112 @@ def test_describe_objects(self): assert_frame_equal(df[['C1', 'C3']].describe(), df[['C3']].describe()) assert_frame_equal(df[['C2', 'C3']].describe(), df[['C3']].describe()) + def test_describe_typefiltering(self): + df = DataFrame({'catA': ['foo', 'foo', 'bar'] * 8, + 'catB': ['a', 'b', 'c', 'd'] * 6, + 'numC': np.arange(24, dtype='int64'), + 'numD': np.arange(24.) + .5, + 'ts': tm.makeTimeSeries()[:24].index}) + + descN = df.describe() + expected_cols = ['numC', 'numD',] + expected = DataFrame(dict((k, df[k].describe()) + for k in expected_cols), + columns=expected_cols) + assert_frame_equal(descN, expected) + + desc = df.describe(include=['number']) + assert_frame_equal(desc, descN) + desc = df.describe(exclude=['object', 'datetime']) + assert_frame_equal(desc, descN) + desc = df.describe(include=['float']) + assert_frame_equal(desc, descN.drop('numC',1)) + + descC = df.describe(include=['O']) + expected_cols = ['catA', 'catB'] + expected = DataFrame(dict((k, df[k].describe()) + for k in expected_cols), + columns=expected_cols) + assert_frame_equal(descC, expected) + + descD = df.describe(include=['datetime']) + assert_series_equal( descD.ts, df.ts.describe()) + + desc = df.describe(include=['object','number', 'datetime']) + assert_frame_equal(desc.loc[:,["numC","numD"]].dropna(), descN) + assert_frame_equal(desc.loc[:,["catA","catB"]].dropna(), descC) + descDs = descD.sort_index() # the index order change for mixed-types + assert_frame_equal(desc.loc[:,"ts":].dropna().sort_index(), descDs) + + desc = df.loc[:,'catA':'catB'].describe(include='all') + assert_frame_equal(desc, descC) + desc = df.loc[:,'numC':'numD'].describe(include='all') + assert_frame_equal(desc, descN) + + desc = df.describe(percentiles = [], include='all') + cnt = Series(data=[4,4,6,6,6], index=['catA','catB','numC','numD','ts']) + assert_series_equal( desc.count(), cnt) + self.assertTrue('count' in desc.index) + self.assertTrue('unique' in desc.index) + self.assertTrue('50%' in desc.index) + self.assertTrue('first' in desc.index) + + desc = df.drop("ts", 1).describe(percentiles = [], include='all') + assert_series_equal( desc.count(), cnt.drop("ts")) + self.assertTrue('first' not in desc.index) + desc = df.drop(["numC","numD"], 1).describe(percentiles = [], include='all') + assert_series_equal( desc.count(), cnt.drop(["numC","numD"])) + self.assertTrue('50%' not in desc.index) + + def test_describe_typefiltering_category_bool(self): + df = DataFrame({'A_cat': pd.Categorical(['foo', 
'foo', 'bar'] * 8), + 'B_str': ['a', 'b', 'c', 'd'] * 6, + 'C_bool': [True] * 12 + [False] * 12, + 'D_num': np.arange(24.) + .5, + 'E_ts': tm.makeTimeSeries()[:24].index}) + + # bool is considered numeric in describe, although not an np.number + desc = df.describe() + expected_cols = ['C_bool', 'D_num'] + expected = DataFrame(dict((k, df[k].describe()) + for k in expected_cols), + columns=expected_cols) + assert_frame_equal(desc, expected) + + desc = df.describe(include=["category"]) + self.assertTrue(desc.columns.tolist() == ["A_cat"]) + + # 'all' includes numpy-dtypes + category + desc1 = df.describe(include="all") + desc2 = df.describe(include=[np.generic, "category"]) + assert_frame_equal(desc1, desc2) + + def test_describe_timedelta(self): + df = DataFrame({"td": pd.to_timedelta(np.arange(24)%20,"D")}) + self.assertTrue(df.describe().loc["mean"][0] == pd.to_timedelta("8d4h")) + + def test_describe_typefiltering_dupcol(self): + df = DataFrame({'catA': ['foo', 'foo', 'bar'] * 8, + 'catB': ['a', 'b', 'c', 'd'] * 6, + 'numC': np.arange(24), + 'numD': np.arange(24.) + .5, + 'ts': tm.makeTimeSeries()[:24].index}) + s = df.describe(include='all').shape[1] + df = pd.concat([df, df], axis=1) + s2 = df.describe(include='all').shape[1] + self.assertTrue(s2 == 2 * s) + + def test_describe_typefiltering_groupby(self): + df = DataFrame({'catA': ['foo', 'foo', 'bar'] * 8, + 'catB': ['a', 'b', 'c', 'd'] * 6, + 'numC': np.arange(24), + 'numD': np.arange(24.) + .5, + 'ts': tm.makeTimeSeries()[:24].index}) + G = df.groupby('catA') + self.assertTrue(G.describe(include=['number']).shape == (16, 2)) + self.assertTrue(G.describe(include=['number', 'object']).shape == (22, 3)) + self.assertTrue(G.describe(include='all').shape == (26, 4)) + def test_no_order(self): tm._skip_if_no_scipy() s = Series([0, 1, np.nan, 3]) @@ -1129,7 +1258,7 @@ def test_tz_convert_and_localize(self): # MultiIndex # GH7846 - df2 = DataFrame(np.ones(5), + df2 = DataFrame(np.ones(5), MultiIndex.from_arrays([l0, l1])) df3 = getattr(df2, fn)('US/Pacific', level=0) @@ -1280,6 +1409,21 @@ def test_equals(self): df2 = df1.set_index(['floats'], append=True) self.assertTrue(df3.equals(df2)) + # GH 8437 + a = pd.Series([False, np.nan]) + b = pd.Series([False, np.nan]) + c = pd.Series(index=range(2)) + d = pd.Series(index=range(2)) + e = pd.Series(index=range(2)) + f = pd.Series(index=range(2)) + c[:-1] = d[:-1] = e[0] = f[0] = False + self.assertTrue(a.equals(a)) + self.assertTrue(a.equals(b)) + self.assertTrue(a.equals(c)) + self.assertTrue(a.equals(d)) + self.assertFalse(a.equals(e)) + self.assertTrue(e.equals(f)) + def test_describe_raises(self): with tm.assertRaises(NotImplementedError): tm.makePanel().describe() diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py index 64254cd480f2a..45814795ec060 100644 --- a/pandas/tests/test_graphics.py +++ b/pandas/tests/test_graphics.py @@ -28,6 +28,15 @@ import pandas.tools.plotting as plotting +def _skip_if_mpl_14_or_dev_boxplot(): + # GH 8382 + # Boxplot failures on 1.4 and 1.4.1 + # Don't need try / except since that's done at class level + import matplotlib + if str(matplotlib.__version__) >= LooseVersion('1.4'): + raise nose.SkipTest("Matplotlib Regression in 1.4 and current dev.") + + def _skip_if_no_scipy_gaussian_kde(): try: import scipy @@ -63,6 +72,11 @@ def setUp(self): 'weight': random.normal(161, 32, size=n), 'category': random.randint(4, size=n)}) + if str(mpl.__version__) >= LooseVersion('1.4'): + self.bp_n_objects = 7 + else: + self.bp_n_objects = 8 + def 
tearDown(self): tm.close() @@ -349,7 +363,6 @@ def _check_has_errorbars(self, axes, xerr=0, yerr=0): yerr : number expected number of y errorbar """ - axes = self._flatten_visible(axes) for ax in axes: containers = ax.containers @@ -365,7 +378,8 @@ def _check_has_errorbars(self, axes, xerr=0, yerr=0): self.assertEqual(xerr, xerr_count) self.assertEqual(yerr, yerr_count) - def _check_box_return_type(self, returned, return_type, expected_keys=None): + def _check_box_return_type(self, returned, return_type, expected_keys=None, + check_ax_title=True): """ Check box returned type is correct @@ -377,6 +391,10 @@ def _check_box_return_type(self, returned, return_type, expected_keys=None): expected_keys : list-like, optional group labels in subplot case. If not passed, the function checks assuming boxplot uses single ax + check_ax_title : bool + Whether to check the ax.title is the same as expected_key + Intended to be checked by calling from ``boxplot``. + Normal ``plot`` doesn't attach ``ax.title``, it must be disabled. """ from matplotlib.axes import Axes types = {'dict': dict, 'axes': Axes, 'both': tuple} @@ -402,14 +420,17 @@ def _check_box_return_type(self, returned, return_type, expected_keys=None): self.assertTrue(isinstance(value, types[return_type])) # check returned dict has correct mapping if return_type == 'axes': - self.assertEqual(value.get_title(), key) + if check_ax_title: + self.assertEqual(value.get_title(), key) elif return_type == 'both': - self.assertEqual(value.ax.get_title(), key) + if check_ax_title: + self.assertEqual(value.ax.get_title(), key) self.assertIsInstance(value.ax, Axes) self.assertIsInstance(value.lines, dict) elif return_type == 'dict': line = value['medians'][0] - self.assertEqual(line.get_axes().get_title(), key) + if check_ax_title: + self.assertEqual(line.get_axes().get_title(), key) else: raise AssertionError @@ -452,7 +473,7 @@ def test_plot(self): _check_plot_works(self.ts.plot, kind='area', stacked=False) _check_plot_works(self.iseries.plot) - for kind in ['line', 'bar', 'barh', 'kde', 'hist']: + for kind in ['line', 'bar', 'barh', 'kde', 'hist', 'box']: if not _ok_for_gaussian_kde(kind): continue _check_plot_works(self.series[:5].plot, kind=kind) @@ -465,6 +486,11 @@ def test_plot(self): ax = _check_plot_works(self.ts.plot, subplots=True) self._check_axes_shape(ax, axes_num=1, layout=(1, 1)) + ax = _check_plot_works(self.ts.plot, subplots=True, layout=(-1, 1)) + self._check_axes_shape(ax, axes_num=1, layout=(1, 1)) + ax = _check_plot_works(self.ts.plot, subplots=True, layout=(1, -1)) + self._check_axes_shape(ax, axes_num=1, layout=(1, 1)) + @slow def test_plot_figsize_and_title(self): # figsize and title @@ -472,6 +498,12 @@ def test_plot_figsize_and_title(self): self._check_text_labels(ax.title, 'Test') self._check_axes_shape(ax, axes_num=1, layout=(1, 1), figsize=(16, 8)) + def test_dont_modify_rcParams(self): + # GH 8242 + colors = self.plt.rcParams['axes.color_cycle'] + Series([1, 2, 3]).plot() + self.assertEqual(colors, self.plt.rcParams['axes.color_cycle']) + def test_ts_line_lim(self): ax = self.ts.plot() xmin, xmax = ax.get_xlim() @@ -555,6 +587,10 @@ def test_bar_ignore_index(self): def test_rotation(self): df = DataFrame(randn(5, 5)) + # Default rot 0 + axes = df.plot() + self._check_ticks_props(axes, xrot=0) + axes = df.plot(rot=30) self._check_ticks_props(axes, xrot=30) @@ -613,7 +649,14 @@ def test_pie_series(self): series = Series([1, 2, np.nan, 4], index=['a', 'b', 'c', 'd'], name='YLABEL') ax = _check_plot_works(series.plot, 
kind='pie') - self._check_text_labels(ax.texts, series.index) + self._check_text_labels(ax.texts, ['a', 'b', '', 'd']) + + def test_pie_nan(self): + s = Series([1, np.nan, 1, 1]) + ax = s.plot(kind='pie', legend=True) + expected = ['0', '', '2', '3'] + result = [x.get_text() for x in ax.texts] + self.assertEqual(result, expected) @slow def test_hist_df_kwargs(self): @@ -664,13 +707,25 @@ def test_hist_layout_with_by(self): axes = _check_plot_works(df.height.hist, by=df.gender, layout=(2, 1)) self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) + axes = _check_plot_works(df.height.hist, by=df.gender, layout=(3, -1)) + self._check_axes_shape(axes, axes_num=2, layout=(3, 1)) + axes = _check_plot_works(df.height.hist, by=df.category, layout=(4, 1)) self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) + axes = _check_plot_works(df.height.hist, by=df.category, layout=(2, -1)) + self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) + + axes = _check_plot_works(df.height.hist, by=df.category, layout=(3, -1)) + self._check_axes_shape(axes, axes_num=4, layout=(3, 2)) + + axes = _check_plot_works(df.height.hist, by=df.category, layout=(-1, 4)) + self._check_axes_shape(axes, axes_num=4, layout=(1, 4)) + axes = _check_plot_works(df.height.hist, by=df.classroom, layout=(2, 2)) self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) - axes = _check_plot_works(df.height.hist, by=df.category, layout=(4, 2), figsize=(12, 7)) + axes = df.height.hist(by=df.category, layout=(4, 2), figsize=(12, 7)) self._check_axes_shape(axes, axes_num=4, layout=(4, 2), figsize=(12, 7)) @slow @@ -738,6 +793,14 @@ def test_kde_kwargs(self): self._check_ax_scales(ax, yaxis='log') self._check_text_labels(ax.yaxis.get_label(), 'Density') + @slow + def test_kde_missing_vals(self): + tm._skip_if_no_scipy() + _skip_if_no_scipy_gaussian_kde() + s = Series(np.random.uniform(size=50)) + s[0] = np.nan + ax = _check_plot_works(s.plot, kind='kde') + @slow def test_hist_kwargs(self): ax = self.ts.plot(kind='hist', bins=5) @@ -767,6 +830,15 @@ def test_hist_kde_color(self): self.assertEqual(len(lines), 1) self._check_colors(lines, ['r']) + @slow + def test_boxplot_series(self): + ax = self.ts.plot(kind='box', logy=True) + self._check_ax_scales(ax, yaxis='log') + xlabels = ax.get_xticklabels() + self._check_text_labels(xlabels, [self.ts.name]) + ylabels = ax.get_yticklabels() + self._check_text_labels(ylabels, [''] * len(ylabels)) + @slow def test_autocorrelation_plot(self): from pandas.tools.plotting import autocorrelation_plot @@ -880,6 +952,7 @@ def setUp(self): mpl.rcdefaults() self.mpl_le_1_2_1 = str(mpl.__version__) <= LooseVersion('1.2.1') + self.mpl_ge_1_3_1 = str(mpl.__version__) >= LooseVersion('1.3.1') self.tdf = tm.makeTimeDataFrame() self.hexbin_df = DataFrame({"A": np.random.uniform(size=20), @@ -896,7 +969,11 @@ def test_plot(self): _check_plot_works(df.plot, grid=False) axes = _check_plot_works(df.plot, subplots=True) self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) - _check_plot_works(df.plot, subplots=True, use_index=False) + + axes = _check_plot_works(df.plot, subplots=True, layout=(-1, 2)) + self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) + + axes = _check_plot_works(df.plot, subplots=True, use_index=False) self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) df = DataFrame({'x': [1, 2], 'y': [3, 4]}) @@ -922,7 +999,7 @@ def test_plot(self): self._check_visible(ax.xaxis) self._check_visible(ax.get_xticklabels()) self._check_visible([ax.xaxis.get_label()]) - self._check_ticks_props(ax, xrot=30) + 
self._check_ticks_props(ax, xrot=0) _check_plot_works(df.plot, title='blah') @@ -954,6 +1031,9 @@ def test_plot(self): axes = _check_plot_works(df.plot, kind='bar', subplots=True) self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + axes = _check_plot_works(df.plot, kind='bar', subplots=True, + layout=(-1, 1)) + self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) # When ax is supplied and required number of axes is 1, # passed ax should be used: fig, ax = self.plt.subplots() @@ -1071,6 +1151,7 @@ def test_subplots(self): for kind in ['bar', 'barh', 'line', 'area']: axes = df.plot(kind=kind, subplots=True, sharex=True, legend=True) self._check_axes_shape(axes, axes_num=3, layout=(3, 1)) + self.assertEqual(axes.shape, (3, )) for ax, column in zip(axes, df.columns): self._check_legend_labels(ax, labels=[com.pprint_thing(column)]) @@ -1122,7 +1203,7 @@ def test_subplots_timeseries(self): self._check_visible(axes[-1].get_xticklabels(minor=True)) self._check_visible(axes[-1].xaxis.get_label()) self._check_visible(axes[-1].get_yticklabels()) - self._check_ticks_props(axes, xrot=30) + self._check_ticks_props(axes, xrot=0) axes = df.plot(kind=kind, subplots=True, sharex=False, rot=45, fontsize=7) for ax in axes: @@ -1133,6 +1214,103 @@ def test_subplots_timeseries(self): self._check_visible(ax.get_yticklabels()) self._check_ticks_props(ax, xlabelsize=7, xrot=45) + def test_subplots_layout(self): + # GH 6667 + df = DataFrame(np.random.rand(10, 3), + index=list(string.ascii_letters[:10])) + + axes = df.plot(subplots=True, layout=(2, 2)) + self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) + self.assertEqual(axes.shape, (2, 2)) + + axes = df.plot(subplots=True, layout=(-1, 2)) + self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) + self.assertEqual(axes.shape, (2, 2)) + + axes = df.plot(subplots=True, layout=(2, -1)) + self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) + self.assertEqual(axes.shape, (2, 2)) + + axes = df.plot(subplots=True, layout=(1, 4)) + self._check_axes_shape(axes, axes_num=3, layout=(1, 4)) + self.assertEqual(axes.shape, (1, 4)) + + axes = df.plot(subplots=True, layout=(-1, 4)) + self._check_axes_shape(axes, axes_num=3, layout=(1, 4)) + self.assertEqual(axes.shape, (1, 4)) + + axes = df.plot(subplots=True, layout=(4, -1)) + self._check_axes_shape(axes, axes_num=3, layout=(4, 1)) + self.assertEqual(axes.shape, (4, 1)) + + with tm.assertRaises(ValueError): + axes = df.plot(subplots=True, layout=(1, 1)) + with tm.assertRaises(ValueError): + axes = df.plot(subplots=True, layout=(-1, -1)) + + # single column + df = DataFrame(np.random.rand(10, 1), + index=list(string.ascii_letters[:10])) + axes = df.plot(subplots=True) + self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + self.assertEqual(axes.shape, (1, )) + + axes = df.plot(subplots=True, layout=(3, 3)) + self._check_axes_shape(axes, axes_num=1, layout=(3, 3)) + self.assertEqual(axes.shape, (3, 3)) + + @slow + def test_subplots_multiple_axes(self): + # GH 5353, 6970, GH 7069 + fig, axes = self.plt.subplots(2, 3) + df = DataFrame(np.random.rand(10, 3), + index=list(string.ascii_letters[:10])) + + returned = df.plot(subplots=True, ax=axes[0]) + self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) + self.assertEqual(returned.shape, (3, )) + self.assertIs(returned[0].figure, fig) + # draw on second row + returned = df.plot(subplots=True, ax=axes[1]) + self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) + self.assertEqual(returned.shape, (3, )) + self.assertIs(returned[0].figure, fig) + 
self._check_axes_shape(axes, axes_num=6, layout=(2, 3)) + tm.close() + + with tm.assertRaises(ValueError): + fig, axes = self.plt.subplots(2, 3) + # pass different number of axes from required + df.plot(subplots=True, ax=axes) + + # pass 2-dim axes and invalid layout + # invalid lauout should not affect to input and return value + # (show warning is tested in + # TestDataFrameGroupByPlots.test_grouped_box_multiple_axes + fig, axes = self.plt.subplots(2, 2) + df = DataFrame(np.random.rand(10, 4), + index=list(string.ascii_letters[:10])) + + returned = df.plot(subplots=True, ax=axes, layout=(2, 1)) + self._check_axes_shape(returned, axes_num=4, layout=(2, 2)) + self.assertEqual(returned.shape, (4, )) + + returned = df.plot(subplots=True, ax=axes, layout=(2, -1)) + self._check_axes_shape(returned, axes_num=4, layout=(2, 2)) + self.assertEqual(returned.shape, (4, )) + + returned = df.plot(subplots=True, ax=axes, layout=(-1, 2)) + self._check_axes_shape(returned, axes_num=4, layout=(2, 2)) + self.assertEqual(returned.shape, (4, )) + + # single column + fig, axes = self.plt.subplots(1, 1) + df = DataFrame(np.random.rand(10, 1), + index=list(string.ascii_letters[:10])) + axes = df.plot(subplots=True, ax=[axes]) + self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + self.assertEqual(axes.shape, (1, )) + def test_negative_log(self): df = - DataFrame(rand(6, 4), index=list(string.ascii_letters[:6]), @@ -1383,6 +1561,23 @@ def test_bar_bottom_left(self): result = [p.get_x() for p in ax.patches] self.assertEqual(result, [1] * 5) + @slow + def test_bar_nan(self): + df = DataFrame({'A': [10, np.nan, 20], 'B': [5, 10, 20], + 'C': [1, 2, 3]}) + ax = df.plot(kind='bar') + expected = [10, 0, 20, 5, 10, 20, 1, 2, 3] + result = [p.get_height() for p in ax.patches] + self.assertEqual(result, expected) + + ax = df.plot(kind='bar', stacked=True) + result = [p.get_height() for p in ax.patches] + self.assertEqual(result, expected) + + result = [p.get_y() for p in ax.patches] + expected = [0.0, 0.0, 0.0, 10.0, 0.0, 20.0, 15.0, 10.0, 40.0] + self.assertEqual(result, expected) + @slow def test_plot_scatter(self): df = DataFrame(randn(6, 4), @@ -1401,6 +1596,37 @@ def test_plot_scatter(self): axes = df.plot(x='x', y='y', kind='scatter', subplots=True) self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + @slow + def test_plot_scatter_with_c(self): + df = DataFrame(randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=['x', 'y', 'z', 'four']) + + axes = [df.plot(kind='scatter', x='x', y='y', c='z'), + df.plot(kind='scatter', x=0, y=1, c=2)] + for ax in axes: + # default to Greys + self.assertEqual(ax.collections[0].cmap.name, 'Greys') + + if self.mpl_ge_1_3_1: + + # n.b. 
there appears to be no public method to get the colorbar + # label + self.assertEqual(ax.collections[0].colorbar._label, 'z') + + cm = 'cubehelix' + ax = df.plot(kind='scatter', x='x', y='y', c='z', colormap=cm) + self.assertEqual(ax.collections[0].cmap.name, cm) + + # verify turning off colorbar works + ax = df.plot(kind='scatter', x='x', y='y', c='z', colorbar=False) + self.assertIs(ax.collections[0].colorbar, None) + + # verify that we can still plot a solid color + ax = df.plot(x=0, y=1, c='red', kind='scatter') + self.assertIs(ax.collections[0].colorbar, None) + self._check_colors(ax.collections, facecolors=['r']) + @slow def test_plot_bar(self): df = DataFrame(randn(6, 4), @@ -1578,6 +1804,102 @@ def test_bar_log_subplots(self): @slow def test_boxplot(self): + df = self.hist_df + series = df['height'] + numeric_cols = df._get_numeric_data().columns + labels = [com.pprint_thing(c) for c in numeric_cols] + + ax = _check_plot_works(df.plot, kind='box') + self._check_text_labels(ax.get_xticklabels(), labels) + assert_array_equal(ax.xaxis.get_ticklocs(), + np.arange(1, len(numeric_cols) + 1)) + self.assertEqual(len(ax.lines), + self.bp_n_objects * len(numeric_cols)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.plot, kind='box', + subplots=True, logy=True) + self._check_axes_shape(axes, axes_num=3, layout=(1, 3)) + self._check_ax_scales(axes, yaxis='log') + for ax, label in zip(axes, labels): + self._check_text_labels(ax.get_xticklabels(), [label]) + self.assertEqual(len(ax.lines), self.bp_n_objects) + + axes = series.plot(kind='box', rot=40) + self._check_ticks_props(axes, xrot=40, yrot=0) + tm.close() + + ax = _check_plot_works(series.plot, kind='box') + + positions = np.array([1, 6, 7]) + ax = df.plot(kind='box', positions=positions) + numeric_cols = df._get_numeric_data().columns + labels = [com.pprint_thing(c) for c in numeric_cols] + self._check_text_labels(ax.get_xticklabels(), labels) + assert_array_equal(ax.xaxis.get_ticklocs(), positions) + self.assertEqual(len(ax.lines), self.bp_n_objects * len(numeric_cols)) + + @slow + def test_boxplot_vertical(self): + df = self.hist_df + numeric_cols = df._get_numeric_data().columns + labels = [com.pprint_thing(c) for c in numeric_cols] + + # if horizontal, yticklabels are rotated + ax = df.plot(kind='box', rot=50, fontsize=8, vert=False) + self._check_ticks_props(ax, xrot=0, yrot=50, ylabelsize=8) + self._check_text_labels(ax.get_yticklabels(), labels) + self.assertEqual(len(ax.lines), self.bp_n_objects * len(numeric_cols)) + + axes = _check_plot_works(df.plot, kind='box', subplots=True, + vert=False, logx=True) + self._check_axes_shape(axes, axes_num=3, layout=(1, 3)) + self._check_ax_scales(axes, xaxis='log') + for ax, label in zip(axes, labels): + self._check_text_labels(ax.get_yticklabels(), [label]) + self.assertEqual(len(ax.lines), self.bp_n_objects) + + positions = np.array([3, 2, 8]) + ax = df.plot(kind='box', positions=positions, vert=False) + self._check_text_labels(ax.get_yticklabels(), labels) + assert_array_equal(ax.yaxis.get_ticklocs(), positions) + self.assertEqual(len(ax.lines), self.bp_n_objects * len(numeric_cols)) + + @slow + def test_boxplot_return_type(self): + df = DataFrame(randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=['one', 'two', 'three', 'four']) + with tm.assertRaises(ValueError): + df.plot(kind='box', return_type='NOTATYPE') + + result = df.plot(kind='box', return_type='dict') + self._check_box_return_type(result, 'dict') + + result = df.plot(kind='box', 
return_type='axes') + self._check_box_return_type(result, 'axes') + + result = df.plot(kind='box', return_type='both') + self._check_box_return_type(result, 'both') + + @slow + def test_boxplot_subplots_return_type(self): + df = self.hist_df + + # normal style: return_type=None + result = df.plot(kind='box', subplots=True) + self.assertIsInstance(result, np.ndarray) + self._check_box_return_type(result, None, + expected_keys=['height', 'weight', 'category']) + + for t in ['dict', 'axes', 'both']: + returned = df.plot(kind='box', return_type=t, subplots=True) + self._check_box_return_type(returned, t, + expected_keys=['height', 'weight', 'category'], + check_ax_title=False) + + @slow + def test_boxplot_legacy(self): df = DataFrame(randn(6, 4), index=list(string.ascii_letters[:6]), columns=['one', 'two', 'three', 'four']) @@ -1621,7 +1943,7 @@ def test_boxplot(self): self.assertEqual(len(ax.get_lines()), len(lines)) @slow - def test_boxplot_return_type(self): + def test_boxplot_return_type_legacy(self): # API change in https://github.com/pydata/pandas/pull/7096 import matplotlib as mpl @@ -1648,6 +1970,40 @@ def test_boxplot_return_type(self): result = df.boxplot(return_type='both') self._check_box_return_type(result, 'both') + @slow + def test_boxplot_axis_limits(self): + + def _check_ax_limits(col, ax): + y_min, y_max = ax.get_ylim() + self.assertTrue(y_min <= col.min()) + self.assertTrue(y_max >= col.max()) + + df = self.hist_df.copy() + df['age'] = np.random.randint(1, 20, df.shape[0]) + # One full row + height_ax, weight_ax = df.boxplot(['height', 'weight'], by='category') + _check_ax_limits(df['height'], height_ax) + _check_ax_limits(df['weight'], weight_ax) + self.assertEqual(weight_ax._sharey, height_ax) + + # Two rows, one partial + p = df.boxplot(['height', 'weight', 'age'], by='category') + height_ax, weight_ax, age_ax = p[0, 0], p[0, 1], p[1, 0] + dummy_ax = p[1, 1] + _check_ax_limits(df['height'], height_ax) + _check_ax_limits(df['weight'], weight_ax) + _check_ax_limits(df['age'], age_ax) + self.assertEqual(weight_ax._sharey, height_ax) + self.assertEqual(age_ax._sharey, height_ax) + self.assertIsNone(dummy_ax._sharey) + + @slow + def test_boxplot_empty_column(self): + _skip_if_mpl_14_or_dev_boxplot() + df = DataFrame(np.random.randn(20, 4)) + df.loc[:, 0] = np.nan + _check_plot_works(df.boxplot, return_type='axes') + @slow def test_kde_df(self): tm._skip_if_no_scipy() @@ -1667,6 +2023,14 @@ def test_kde_df(self): axes = df.plot(kind='kde', logy=True, subplots=True) self._check_ax_scales(axes, yaxis='log') + @slow + def test_kde_missing_vals(self): + tm._skip_if_no_scipy() + _skip_if_no_scipy_gaussian_kde() + df = DataFrame(np.random.uniform(size=(100, 4))) + df.loc[0, 0] = np.nan + ax = _check_plot_works(df.plot, kind='kde') + @slow def test_hist_df(self): df = DataFrame(randn(100, 4)) @@ -1850,6 +2214,10 @@ def test_hist_layout(self): {'layout': (4, 1), 'expected_size': (4, 1)}, {'layout': (1, 4), 'expected_size': (1, 4)}, {'layout': (3, 3), 'expected_size': (3, 3)}, + {'layout': (-1, 4), 'expected_size': (1, 4)}, + {'layout': (4, -1), 'expected_size': (4, 1)}, + {'layout': (-1, 2), 'expected_size': (2, 2)}, + {'layout': (2, -1), 'expected_size': (2, 2)} ) for layout_test in layout_to_expected_size: @@ -1864,6 +2232,9 @@ def test_hist_layout(self): # invalid format for layout with tm.assertRaises(ValueError): df.hist(layout=(1,)) + with tm.assertRaises(ValueError): + df.hist(layout=(-1, -1)) + @slow def test_scatter(self): @@ -2019,10 +2390,10 @@ def 
test_df_legend_labels(self): self._check_legend_labels(ax, labels=df.columns) ax = df3.plot(kind=kind, legend=True, ax=ax) - self._check_legend_labels(ax, labels=df.columns + df3.columns) + self._check_legend_labels(ax, labels=df.columns.union(df3.columns)) ax = df4.plot(kind=kind, legend='reverse', ax=ax) - expected = list(df.columns + df3.columns) + list(reversed(df4.columns)) + expected = list(df.columns.union(df3.columns)) + list(reversed(df4.columns)) self._check_legend_labels(ax, labels=expected) # Secondary Y @@ -2106,6 +2477,16 @@ def test_style_by_column(self): for i, l in enumerate(ax.get_lines()[:len(markers)]): self.assertEqual(l.get_marker(), markers[i]) + @slow + def test_line_label_none(self): + s = Series([1, 2]) + ax = s.plot() + self.assertEqual(ax.get_legend(), None) + + ax = s.plot(legend=True) + self.assertEqual(ax.get_legend().get_texts()[0].get_text(), + 'None') + @slow def test_line_colors(self): import sys @@ -2243,6 +2624,61 @@ def test_kde_colors(self): rgba_colors = lmap(cm.jet, np.linspace(0, 1, len(df))) self._check_colors(ax.get_lines(), linecolors=rgba_colors) + @slow + def test_boxplot_colors(self): + + def _check_colors(bp, box_c, whiskers_c, medians_c, caps_c='k', fliers_c='b'): + self._check_colors(bp['boxes'], linecolors=[box_c] * len(bp['boxes'])) + self._check_colors(bp['whiskers'], linecolors=[whiskers_c] * len(bp['whiskers'])) + self._check_colors(bp['medians'], linecolors=[medians_c] * len(bp['medians'])) + self._check_colors(bp['fliers'], linecolors=[fliers_c] * len(bp['fliers'])) + self._check_colors(bp['caps'], linecolors=[caps_c] * len(bp['caps'])) + + default_colors = self.plt.rcParams.get('axes.color_cycle') + + df = DataFrame(randn(5, 5)) + bp = df.plot(kind='box', return_type='dict') + _check_colors(bp, default_colors[0], default_colors[0], default_colors[2]) + tm.close() + + dict_colors = dict(boxes='#572923', whiskers='#982042', + medians='#804823', caps='#123456') + bp = df.plot(kind='box', color=dict_colors, sym='r+', return_type='dict') + _check_colors(bp, dict_colors['boxes'], dict_colors['whiskers'], + dict_colors['medians'], dict_colors['caps'], 'r') + tm.close() + + # partial colors + dict_colors = dict(whiskers='c', medians='m') + bp = df.plot(kind='box', color=dict_colors, return_type='dict') + _check_colors(bp, default_colors[0], 'c', 'm') + tm.close() + + from matplotlib import cm + # Test str -> colormap functionality + bp = df.plot(kind='box', colormap='jet', return_type='dict') + jet_colors = lmap(cm.jet, np.linspace(0, 1, 3)) + _check_colors(bp, jet_colors[0], jet_colors[0], jet_colors[2]) + tm.close() + + # Test colormap functionality + bp = df.plot(kind='box', colormap=cm.jet, return_type='dict') + _check_colors(bp, jet_colors[0], jet_colors[0], jet_colors[2]) + tm.close() + + # string color is applied to all artists except fliers + bp = df.plot(kind='box', color='DodgerBlue', return_type='dict') + _check_colors(bp, 'DodgerBlue', 'DodgerBlue', 'DodgerBlue', + 'DodgerBlue') + + # tuple is also applied to all artists except fliers + bp = df.plot(kind='box', color=(0, 1, 0), sym='#123456', return_type='dict') + _check_colors(bp, (0, 1, 0), (0, 1, 0), (0, 1, 0), (0, 1, 0), '#123456') + + with tm.assertRaises(ValueError): + # Color contains invalid key results in ValueError + df.plot(kind='box', color=dict(boxes='red', xxxx='blue')) + def test_default_color_cycle(self): import matplotlib.pyplot as plt plt.rcParams['axes.color_cycle'] = list('rgbk') @@ -2364,6 +2800,9 @@ def test_pie_df(self): ax = _check_plot_works(df.plot, 
kind='pie', y='Y') self._check_text_labels(ax.texts, df.index) + ax = _check_plot_works(df.plot, kind='pie', y=2) + self._check_text_labels(ax.texts, df.index) + axes = _check_plot_works(df.plot, kind='pie', subplots=True) self.assertEqual(len(axes), len(df.columns)) for ax in axes: @@ -2381,6 +2820,26 @@ def test_pie_df(self): self._check_text_labels(ax.texts, labels) self._check_colors(ax.patches, facecolors=color_args) + def test_pie_df_nan(self): + df = DataFrame(np.random.rand(4, 4)) + for i in range(4): + df.iloc[i, i] = np.nan + fig, axes = self.plt.subplots(ncols=4) + df.plot(kind='pie', subplots=True, ax=axes, legend=True) + + base_expected = ['0', '1', '2', '3'] + for i, ax in enumerate(axes): + expected = list(base_expected) # force copy + expected[i] = '' + result = [x.get_text() for x in ax.texts] + self.assertEqual(result, expected) + # legend labels + # NaN's not included in legend with subplots + # see https://github.com/pydata/pandas/issues/8390 + self.assertEqual([x.get_text() for x in + ax.get_legend().get_texts()], + base_expected[:i] + base_expected[i+1:]) + def test_errorbar_plot(self): d = {'x': np.arange(12), 'y': np.arange(12, 0, -1)} df = DataFrame(d) @@ -2540,12 +2999,28 @@ def test_errorbar_scatter(self): self._check_has_errorbars(ax, xerr=0, yerr=0) ax = _check_plot_works(df.plot, kind='scatter', x='x', y='y', xerr=df_err) self._check_has_errorbars(ax, xerr=1, yerr=0) + ax = _check_plot_works(df.plot, kind='scatter', x='x', y='y', yerr=df_err) self._check_has_errorbars(ax, xerr=0, yerr=1) ax = _check_plot_works(df.plot, kind='scatter', x='x', y='y', xerr=df_err, yerr=df_err) self._check_has_errorbars(ax, xerr=1, yerr=1) + def _check_errorbar_color(containers, expected, has_err='has_xerr'): + errs = [c.lines[1][0] for c in ax.containers if getattr(c, has_err, False)] + self._check_colors(errs, linecolors=[expected] * len(errs)) + + # GH 8081 + df = DataFrame(np.random.randn(10, 5), columns=['a', 'b', 'c', 'd', 'e']) + ax = df.plot(kind='scatter', x='a', y='b', xerr='d', yerr='e', c='red') + self._check_has_errorbars(ax, xerr=1, yerr=1) + _check_errorbar_color(ax.containers, 'red', has_err='has_xerr') + _check_errorbar_color(ax.containers, 'red', has_err='has_yerr') + + ax = df.plot(kind='scatter', x='a', y='b', yerr='e', color='green') + self._check_has_errorbars(ax, xerr=0, yerr=1) + _check_errorbar_color(ax.containers, 'green', has_err='has_yerr') + @tm.mplskip class TestDataFrameGroupByPlots(TestPlotBase): @@ -2554,7 +3029,7 @@ class TestDataFrameGroupByPlots(TestPlotBase): def test_boxplot(self): grouped = self.hist_df.groupby(by='gender') axes = _check_plot_works(grouped.boxplot, return_type='axes') - self._check_axes_shape(axes.values(), axes_num=2, layout=(1, 2)) + self._check_axes_shape(list(axes.values()), axes_num=2, layout=(1, 2)) axes = _check_plot_works(grouped.boxplot, subplots=False, return_type='axes') @@ -2566,7 +3041,7 @@ def test_boxplot(self): grouped = df.groupby(level=1) axes = _check_plot_works(grouped.boxplot, return_type='axes') - self._check_axes_shape(axes.values(), axes_num=10, layout=(4, 3)) + self._check_axes_shape(list(axes.values()), axes_num=10, layout=(4, 3)) axes = _check_plot_works(grouped.boxplot, subplots=False, return_type='axes') @@ -2574,7 +3049,7 @@ def test_boxplot(self): grouped = df.unstack(level=1).groupby(level=0, axis=1) axes = _check_plot_works(grouped.boxplot, return_type='axes') - self._check_axes_shape(axes.values(), axes_num=3, layout=(2, 2)) + self._check_axes_shape(list(axes.values()), axes_num=3, 
layout=(2, 2)) axes = _check_plot_works(grouped.boxplot, subplots=False, return_type='axes') @@ -2692,6 +3167,8 @@ def test_grouped_box_layout(self): by=df.gender, layout=(1, 1)) self.assertRaises(ValueError, df.boxplot, column=['height', 'weight', 'category'], layout=(2, 1), return_type='dict') + self.assertRaises(ValueError, df.boxplot, column=['weight', 'height'], + by=df.gender, layout=(-1, -1)) box = _check_plot_works(df.groupby('gender').boxplot, column='height', return_type='dict') @@ -2724,15 +3201,64 @@ def test_grouped_box_layout(self): box = _check_plot_works(df.groupby('category').boxplot, column='height', layout=(3, 2), return_type='dict') self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(3, 2)) + box = _check_plot_works(df.groupby('category').boxplot, column='height', + layout=(3, -1), return_type='dict') + self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(3, 2)) - box = df.boxplot(column=['height', 'weight', 'category'], by='gender', layout=(4, 1)) + box = df.boxplot(column=['height', 'weight', 'category'], by='gender', + layout=(4, 1)) self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(4, 1)) + box = df.boxplot(column=['height', 'weight', 'category'], by='gender', + layout=(-1, 1)) + self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(3, 1)) + box = df.groupby('classroom').boxplot( column=['height', 'weight', 'category'], layout=(1, 4), return_type='dict') self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(1, 4)) + box = df.groupby('classroom').boxplot( + column=['height', 'weight', 'category'], layout=(1, -1), + return_type='dict') + self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(1, 3)) + + + @slow + def test_grouped_box_multiple_axes(self): + # GH 6970, GH 7069 + df = self.hist_df + + # check warning to ignore sharex / sharey + # this check should be done in the first function which + # passes multiple axes to plot, hist or boxplot + # location should be changed if other test is added + # which has earlier alphabetical order + with tm.assert_produces_warning(UserWarning): + fig, axes = self.plt.subplots(2, 2) + df.groupby('category').boxplot(column='height', return_type='axes', ax=axes) + self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(2, 2)) + + fig, axes = self.plt.subplots(2, 3) + returned = df.boxplot(column=['height', 'weight', 'category'], by='gender', + return_type='axes', ax=axes[0]) + returned = np.array(list(returned.values())) + self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) + self.assert_numpy_array_equal(returned, axes[0]) + self.assertIs(returned[0].figure, fig) + # draw on second row + returned = df.groupby('classroom').boxplot(column=['height', 'weight', 'category'], + return_type='axes', ax=axes[1]) + returned = np.array(list(returned.values())) + self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) + self.assert_numpy_array_equal(returned, axes[1]) + self.assertIs(returned[0].figure, fig) + + with tm.assertRaises(ValueError): + fig, axes = self.plt.subplots(2, 3) + # pass different number of axes from required + axes = df.groupby('classroom').boxplot(ax=axes) + @slow def test_grouped_hist_layout(self): @@ -2741,16 +3267,26 @@ def test_grouped_hist_layout(self): layout=(1, 1)) self.assertRaises(ValueError, df.hist, column='height', by=df.category, layout=(1, 3)) + self.assertRaises(ValueError, df.hist, column='height', by=df.category, + layout=(-1, -1)) + + axes = _check_plot_works(df.hist, column='height', by=df.gender, + layout=(2, 1)) 
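# [editor's note -- illustrative aside, not part of the patch]
# The layout=(n, -1) / (-1, n) cases exercised in these tests rely on the
# missing grid dimension being inferred from the number of plots; passing -1
# for both dimensions raises ValueError, as asserted above.  A minimal
# standalone sketch of that inference (hypothetical helper name; assumes
# plain ceiling division, which matches the layouts the tests expect,
# e.g. 4 plots with (-1, 2) -> (2, 2) and 2 plots with (3, -1) -> (3, 1)):
import math

def infer_layout(nplots, nrows, ncols):
    # replace the -1 placeholder with the smallest size that fits nplots
    if nrows == -1 and ncols == -1:
        raise ValueError("at most one dimension of layout can be -1")
    if nrows == -1:
        nrows = int(math.ceil(nplots / float(ncols)))
    elif ncols == -1:
        ncols = int(math.ceil(nplots / float(nrows)))
    return nrows, ncols

assert infer_layout(4, -1, 2) == (2, 2)
assert infer_layout(2, 3, -1) == (3, 1)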
+ self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) - axes = _check_plot_works(df.hist, column='height', by=df.gender, layout=(2, 1)) + axes = _check_plot_works(df.hist, column='height', by=df.gender, + layout=(2, -1)) self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) - axes = _check_plot_works(df.hist, column='height', by=df.category, layout=(4, 1)) + axes = df.hist(column='height', by=df.category, layout=(4, 1)) + self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) + + axes = df.hist(column='height', by=df.category, layout=(-1, 1)) self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) - axes = _check_plot_works(df.hist, column='height', by=df.category, - layout=(4, 2), figsize=(12, 8)) + axes = df.hist(column='height', by=df.category, layout=(4, 2), figsize=(12, 8)) self._check_axes_shape(axes, axes_num=4, layout=(4, 2), figsize=(12, 8)) + tm.close() # GH 6769 axes = _check_plot_works(df.hist, column='height', by='classroom', layout=(2, 2)) @@ -2760,13 +3296,32 @@ def test_grouped_hist_layout(self): axes = _check_plot_works(df.hist, by='classroom') self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) - axes = _check_plot_works(df.hist, by='gender', layout=(3, 5)) + axes = df.hist(by='gender', layout=(3, 5)) self._check_axes_shape(axes, axes_num=2, layout=(3, 5)) - axes = _check_plot_works(df.hist, column=['height', 'weight', 'category']) + axes = df.hist(column=['height', 'weight', 'category']) self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) @slow + def test_grouped_hist_multiple_axes(self): + # GH 6970, GH 7069 + df = self.hist_df + + fig, axes = self.plt.subplots(2, 3) + returned = df.hist(column=['height', 'weight', 'category'], ax=axes[0]) + self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) + self.assert_numpy_array_equal(returned, axes[0]) + self.assertIs(returned[0].figure, fig) + returned = df.hist(by='classroom', ax=axes[1]) + self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) + self.assert_numpy_array_equal(returned, axes[1]) + self.assertIs(returned[0].figure, fig) + + with tm.assertRaises(ValueError): + fig, axes = self.plt.subplots(2, 3) + # pass different number of axes from required + axes = df.hist(column='height', ax=axes) + @slow def test_axis_share_x(self): df = self.hist_df # GH4089 diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 6f39750de9d9b..7ead8b30e8671 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -8,7 +8,6 @@ from pandas import date_range,bdate_range, Timestamp from pandas.core.index import Index, MultiIndex, Int64Index -from pandas.core.common import rands from pandas.core.api import Categorical, DataFrame from pandas.core.groupby import (SpecificationError, DataError, _nargsort, _lexsort_indexer) @@ -19,7 +18,7 @@ assert_index_equal, assertRaisesRegexp) from pandas.compat import( range, long, lrange, StringIO, lmap, lzip, map, - zip, builtins, OrderedDict + zip, builtins, OrderedDict, product as cart_product ) from pandas import compat from pandas.core.panel import Panel @@ -313,6 +312,30 @@ def test_nth(self): expected = g.B.first() assert_series_equal(result,expected) + # test multiple nth values + df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], + columns=['A', 'B']) + g = df.groupby('A') + + assert_frame_equal(g.nth(0), df.iloc[[0, 3]].set_index('A')) + assert_frame_equal(g.nth([0]), df.iloc[[0, 3]].set_index('A')) + assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]].set_index('A')) + assert_frame_equal(g.nth([0, -1]), 
df.iloc[[0, 2, 3, 4]].set_index('A')) + assert_frame_equal(g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]].set_index('A')) + assert_frame_equal(g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index('A')) + assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index('A')) + assert_frame_equal(g.nth([3, 4]), df.loc[[],['B']]) + + business_dates = pd.date_range(start='4/1/2014', end='6/30/2014', freq='B') + df = DataFrame(1, index=business_dates, columns=['a', 'b']) + # get the first, fourth and last two business days for each month + result = df.groupby((df.index.year, df.index.month)).nth([0, 3, -2, -1]) + expected_dates = pd.to_datetime(['2014/4/1', '2014/4/4', '2014/4/29', '2014/4/30', + '2014/5/1', '2014/5/6', '2014/5/29', '2014/5/30', + '2014/6/2', '2014/6/5', '2014/6/27', '2014/6/30']) + expected = DataFrame(1, columns=['a', 'b'], index=expected_dates) + assert_frame_equal(result, expected) + def test_grouper_index_types(self): # related GH5375 # groupby misbehaving when using a Floatlike index @@ -327,6 +350,28 @@ def test_grouper_index_types(self): df.index = list(reversed(df.index.tolist())) df.groupby(list('abcde')).apply(lambda x: x) + def test_grouper_multilevel_freq(self): + + # GH 7885 + # with level and freq specified in a pd.Grouper + from datetime import date, timedelta + d0 = date.today() - timedelta(days=14) + dates = date_range(d0, date.today()) + date_index = pd.MultiIndex.from_product([dates, dates], names=['foo', 'bar']) + df = pd.DataFrame(np.random.randint(0, 100, 225), index=date_index) + + # Check string level + expected = df.reset_index().groupby([pd.Grouper(key='foo', freq='W'), + pd.Grouper(key='bar', freq='W')]).sum() + result = df.groupby([pd.Grouper(level='foo', freq='W'), + pd.Grouper(level='bar', freq='W')]).sum() + assert_frame_equal(result, expected) + + # Check integer level + result = df.groupby([pd.Grouper(level=0, freq='W'), + pd.Grouper(level=1, freq='W')]).sum() + assert_frame_equal(result, expected) + def test_grouper_iter(self): self.assertEqual(sorted(self.df.groupby('A').grouper), ['bar', 'foo']) @@ -341,6 +386,19 @@ def test_groupby_grouper(self): expected = grouped.mean() assert_frame_equal(result, expected) + def test_groupby_duplicated_column_errormsg(self): + # GH7511 + df = DataFrame(columns=['A','B','A','C'], \ + data=[range(4), range(2,6), range(0, 8, 2)]) + + self.assertRaises(ValueError, df.groupby, 'A') + self.assertRaises(ValueError, df.groupby, ['A', 'B']) + + grouped = df.groupby('B') + c = grouped.count() + self.assertTrue(c.columns.nlevels == 1) + self.assertTrue(c.columns.size == 3) + def test_groupby_dict_mapping(self): # GH #679 from pandas import Series @@ -598,6 +656,23 @@ def test_get_group(self): self.assertRaises(ValueError, lambda : g.get_group(('foo'))) self.assertRaises(ValueError, lambda : g.get_group(('foo','bar','baz'))) + def test_get_group_grouped_by_tuple(self): + # GH 8121 + df = DataFrame([[(1,), (1, 2), (1,), (1, 2)]], + index=['ids']).T + gr = df.groupby('ids') + expected = DataFrame({'ids': [(1,), (1,)]}, index=[0, 2]) + result = gr.get_group((1,)) + assert_frame_equal(result, expected) + + dt = pd.to_datetime(['2010-01-01', '2010-01-02', '2010-01-01', + '2010-01-02']) + df = DataFrame({'ids': [(x,) for x in dt]}) + gr = df.groupby('ids') + result = gr.get_group(('2010-01-01',)) + expected = DataFrame({'ids': [(dt[0],), (dt[0],)]}, index=[0, 2]) + assert_frame_equal(result, expected) + def test_agg_apply_corner(self): # nothing to group, all NA grouped = self.ts.groupby(self.ts * np.nan) @@ -632,6 +707,11 @@ def 
test_agg_grouping_is_list_tuple(self): expected = grouped.mean() tm.assert_frame_equal(result, expected) + def test_grouping_error_on_multidim_input(self): + from pandas.core.groupby import Grouping + self.assertRaises(ValueError, \ + Grouping, self.df.index, self.df[['A','A']]) + def test_agg_python_multiindex(self): grouped = self.mframe.groupby(['A', 'B']) @@ -796,6 +876,31 @@ def test_transform(self): transformed = grouped.transform(lambda x: x * x.sum()) self.assertEqual(transformed[7], 12) + # GH 8046 + # make sure that we preserve the input order + + df = DataFrame(np.arange(6,dtype='int64').reshape(3,2), columns=["a","b"], index=[0,2,1]) + key = [0,0,1] + expected = df.sort_index().groupby(key).transform(lambda x: x-x.mean()).groupby(key).mean() + result = df.groupby(key).transform(lambda x: x-x.mean()).groupby(key).mean() + assert_frame_equal(result, expected) + + def demean(arr): + return arr - arr.mean() + + people = DataFrame(np.random.randn(5, 5), + columns=['a', 'b', 'c', 'd', 'e'], + index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis']) + key = ['one', 'two', 'one', 'two', 'one'] + result = people.groupby(key).transform(demean).groupby(key).mean() + expected = people.groupby(key).apply(demean).groupby(key).mean() + assert_frame_equal(result, expected) + + # GH 8430 + df = tm.makeTimeDataFrame() + g = df.groupby(pd.TimeGrouper('M')) + g.transform(lambda x: x-1) + def test_transform_fast(self): df = DataFrame( { 'id' : np.arange( 100000 ) / 3, @@ -1394,6 +1499,16 @@ def test_groupby_as_index_agg(self): result3 = grouped['C'].agg({'Q': np.sum}) assert_frame_equal(result3, expected3) + def test_mulitindex_passthru(self): + + # GH 7997 + # regression from 0.14.1 + df = pd.DataFrame([[1,2,3],[4,5,6],[7,8,9]]) + df.columns = pd.MultiIndex.from_tuples([(0,1),(1,1),(2,1)]) + + result = df.groupby(axis=1, level=[0,1]).first() + assert_frame_equal(result, df) + def test_multifunc_select_col_integer_cols(self): df = self.df df.columns = np.arange(len(df.columns)) @@ -1659,6 +1774,33 @@ def test_nonsense_func(self): df = DataFrame([0]) self.assertRaises(Exception, df.groupby, lambda x: x + 'foo') + def test_builtins_apply(self): # GH8155 + df = pd.DataFrame(np.random.randint(1, 50, (1000, 2)), + columns=['jim', 'joe']) + df['jolie'] = np.random.randn(1000) + print(df.head()) + + for keys in ['jim', ['jim', 'joe']]: # single key & multi-key + if keys == 'jim': continue + for f in [max, min, sum]: + fname = f.__name__ + result = df.groupby(keys).apply(f) + _shape = result.shape + ngroups = len(df.drop_duplicates(subset=keys)) + assert result.shape == (ngroups, 3), 'invalid frame shape: '\ + '{} (expected ({}, 3))'.format(result.shape, ngroups) + + assert_frame_equal(result, # numpy's equivalent function + df.groupby(keys).apply(getattr(np, fname))) + + if f != sum: + expected = df.groupby(keys).agg(fname).reset_index() + expected.set_index(keys, inplace=True, drop=False) + assert_frame_equal(result, expected, check_dtype=False) + + assert_series_equal(getattr(result, fname)(), + getattr(df, fname)()) + def test_cythonized_aggers(self): data = {'A': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., nan, nan], 'B': ['A', 'B'] * 6, @@ -1969,6 +2111,18 @@ def f_no_copy(x): grpby_no_copy = mydf.groupby('cat1').apply(f_no_copy) assert_series_equal(grpby_copy,grpby_no_copy) + def test_no_mutate_but_looks_like(self): + + # GH 8467 + # first show's mutation indicator + # second does not, but should yield the same results + df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3], + 'value': range(9)}) + + result1 = 
df.groupby('key', group_keys=True).apply(lambda x: x[:].key) + result2 = df.groupby('key', group_keys=True).apply(lambda x: x.key) + assert_series_equal(result1, result2) + def test_apply_chunk_view(self): # Low level tinkering could be unsafe, make sure not df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3], @@ -2129,6 +2283,20 @@ def test_count_object(self): expected = pd.Series([1, 3], index=[2, 3], name='a') tm.assert_series_equal(result, expected) + def test_count_cross_type(self): # GH8169 + vals = np.hstack((np.random.randint(0,5,(100,2)), + np.random.randint(0,2,(100,2)))) + + df = pd.DataFrame(vals, columns=['a', 'b', 'c', 'd']) + df[df==2] = np.nan + expected = df.groupby(['c', 'd']).count() + + for t in ['float32', 'object']: + df['a'] = df['a'].astype(t) + df['b'] = df['b'].astype(t) + result = df.groupby(['c', 'd']).count() + tm.assert_frame_equal(result, expected) + def test_non_cython_api(self): # GH5610 @@ -2410,7 +2578,7 @@ def test_cython_grouper_series_bug_noncontig(self): self.assertTrue(result.isnull().all()) def test_series_grouper_noncontig_index(self): - index = Index([tm.rands(10) for _ in range(100)]) + index = Index(tm.rands_array(10, 100)) values = Series(np.random.randn(50), index=index[::2]) labels = np.random.randint(0, 5, 50) @@ -2700,8 +2868,8 @@ def test_column_select_via_attr(self): assert_frame_equal(result, expected) def test_rank_apply(self): - lev1 = np.array([rands(10) for _ in range(100)], dtype=object) - lev2 = np.array([rands(10) for _ in range(130)], dtype=object) + lev1 = tm.rands_array(10, 100) + lev2 = tm.rands_array(10, 130) lab1 = np.random.randint(0, 100, size=500) lab2 = np.random.randint(0, 130, size=500) @@ -2924,7 +3092,7 @@ def __call__(self, x): lambda x: sum(x), lambda x: x.sum(), partial(sum), fn_class()] - + expected = df.groupby("foo").agg(sum) for ecall in equiv_callables: result = df.groupby('foo').agg(ecall) @@ -3128,10 +3296,11 @@ def test_groupby_categorical_no_compress(self): cats = Categorical.from_codes(codes, [0, 1, 2, 3]) result = data.groupby(cats).mean() - exp = data.groupby(codes).mean().reindex(cats.levels) + exp = data.groupby(codes).mean().reindex(cats.categories) assert_series_equal(result, exp) - cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], levels=["a","b","c","d"]) + cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], + categories=["a","b","c","d"]) data = DataFrame({"a":[1,1,1,2,2,2,3,4,5], "b":cats}) result = data.groupby("b").mean() @@ -3174,10 +3343,11 @@ def test_groupby_categorical_unequal_len(self): import pandas as pd #GH3011 series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4]) - bins = pd.cut(series.dropna(), 4) + # The raises only happens with categorical, not with series of types category + bins = pd.cut(series.dropna().values, 4) # len(bins) != len(series) here - self.assertRaises(AssertionError,lambda : series.groupby(bins).mean()) + self.assertRaises(ValueError,lambda : series.groupby(bins).mean()) def test_gb_apply_list_of_unequal_len_arrays(self): @@ -4188,7 +4358,45 @@ def test_groupby_whitelist(self): gb = obj.groupby(df.letters) self.assertEqual(whitelist, gb._apply_whitelist) for m in whitelist: - getattr(gb, m) + getattr(type(gb), m) + + AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew', + 'mad', 'std', 'var', 'sem'] + AGG_FUNCTIONS_WITH_SKIPNA = ['skew', 'mad'] + + def test_regression_whitelist_methods(self) : + + # GH6944 + # explicity test the whitelest methods + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 
'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + raw_frame = DataFrame(np.random.randn(10, 3), index=index, + columns=Index(['A', 'B', 'C'], name='exp')) + raw_frame.ix[1, [1, 2]] = np.nan + raw_frame.ix[7, [0, 1]] = np.nan + + for op, level, axis, skipna in cart_product(self.AGG_FUNCTIONS, + lrange(2), lrange(2), + [True,False]) : + + if axis == 0 : + frame = raw_frame + else : + frame = raw_frame.T + + if op in self.AGG_FUNCTIONS_WITH_SKIPNA : + grouped = frame.groupby(level=level,axis=axis) + result = getattr(grouped,op)(skipna=skipna) + expected = getattr(frame,op)(level=level,axis=axis,skipna=skipna) + assert_frame_equal(result, expected) + else : + grouped = frame.groupby(level=level,axis=axis) + result = getattr(grouped,op)() + expected = getattr(frame,op)(level=level,axis=axis) + assert_frame_equal(result, expected) def test_groupby_blacklist(self): from string import ascii_lowercase @@ -4549,7 +4757,41 @@ def test_transform_doesnt_clobber_ints(self): expected = gb2.transform('mean') tm.assert_frame_equal(result, expected) - + def test_groupby_categorical_two_columns(self): + + # https://github.com/pydata/pandas/issues/8138 + d = {'cat': pd.Categorical(["a","b","a","b"], categories=["a", "b", "c"]), + 'ints': [1, 1, 2, 2],'val': [10, 20, 30, 40]} + test = pd.DataFrame(d) + + # Grouping on a single column + groups_single_key = test.groupby("cat") + res = groups_single_key.agg('mean') + exp = DataFrame({"ints":[1.5,1.5,np.nan], "val":[20,30,np.nan]}, + index=pd.Index(["a", "b", "c"], name="cat")) + tm.assert_frame_equal(res, exp) + + # Grouping on two columns + groups_double_key = test.groupby(["cat","ints"]) + res = groups_double_key.agg('mean') + exp = DataFrame({"val":[10,30,20,40,np.nan,np.nan], + "cat": ["a","a","b","b","c","c"], + "ints": [1,2,1,2,1,2]}).set_index(["cat","ints"]) + tm.assert_frame_equal(res, exp) + + d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]} + test = pd.DataFrame(d) + values = pd.cut(test['C1'], [1, 2, 3, 6]) + values.name = "cat" + groups_double_key = test.groupby([values,'C2']) + + res = groups_double_key.agg('mean') + nan = np.nan + idx = MultiIndex.from_product([["(1, 2]", "(2, 3]", "(3, 6]"],[1,2,3,4]], + names=["cat", "C2"]) + exp = DataFrame({"C1":[nan,nan,nan,nan, 3, 3,nan,nan, nan,nan, 4, 5], + "C3":[nan,nan,nan,nan, 10,100,nan,nan, nan,nan,200,34]}, index=idx) + tm.assert_frame_equal(res, exp) def assert_fp_equal(a, b): diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 60105719179ad..3c5f3a8d6b6d3 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -16,6 +16,7 @@ from pandas.core.index import (Index, Float64Index, Int64Index, MultiIndex, InvalidIndexError, NumericIndex) from pandas.tseries.index import DatetimeIndex +from pandas.tseries.tdi import TimedeltaIndex from pandas.tseries.period import PeriodIndex from pandas.core.series import Series from pandas.util.testing import (assert_almost_equal, assertRaisesRegexp, @@ -53,13 +54,13 @@ def test_numeric_compat(self): idx = self.create_index() tm.assertRaisesRegexp(TypeError, - "cannot perform multiplication", + "cannot perform __mul__", lambda : idx * 1) tm.assertRaisesRegexp(TypeError, - "cannot perform multiplication", + "cannot perform __mul__", lambda : 1 * idx) - div_err = "cannot perform true division" if compat.PY3 else "cannot perform division" + div_err = "cannot perform __truediv__" if compat.PY3 else "cannot perform __div__" 
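# [editor's note -- illustrative aside, not part of the patch]
# The expected messages above change because disallowed arithmetic on an
# Index is now reported per special method (__mul__, __truediv__/__div__,
# __floordiv__) rather than as a spelled-out operation name.  A small,
# hedged illustration of what these assertions check (standalone sketch;
# assumes an object-dtype Index comparable to what create_index() returns
# for the base test class):
import pandas as pd

obj_idx = pd.Index(['a', 'b', 'c'])
try:
    obj_idx * 1          # multiplication is not defined for this index type
except TypeError as err:
    # message now names the dunder, e.g. "cannot perform __mul__ ..."
    print(err)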
tm.assertRaisesRegexp(TypeError, div_err, lambda : idx / 1) @@ -67,10 +68,10 @@ def test_numeric_compat(self): div_err, lambda : 1 / idx) tm.assertRaisesRegexp(TypeError, - "cannot perform floor division", + "cannot perform __floordiv__", lambda : idx // 1) tm.assertRaisesRegexp(TypeError, - "cannot perform floor division", + "cannot perform __floordiv__", lambda : 1 // idx) def test_boolean_context_compat(self): @@ -93,6 +94,7 @@ def setUp(self): dateIndex = tm.makeDateIndex(100), intIndex = tm.makeIntIndex(100), floatIndex = tm.makeFloatIndex(100), + boolIndex = Index([True,False]), empty = Index([]), tuples = MultiIndex.from_tuples(lzip(['foo', 'bar', 'baz'], [1, 2, 3])) @@ -269,6 +271,15 @@ def test_view(self): i_view = i.view() self.assertEqual(i_view.name, 'Foo') + def test_legacy_pickle_identity(self): + + # GH 8431 + pth = tm.get_data_path() + s1 = pd.read_pickle(os.path.join(pth,'s1-0.12.0.pickle')) + s2 = pd.read_pickle(os.path.join(pth,'s2-0.12.0.pickle')) + self.assertFalse(s1.index.identical(s2.index)) + self.assertFalse(s1.index.equals(s2.index)) + def test_astype(self): casted = self.intIndex.astype('i8') @@ -390,6 +401,12 @@ def test_asof(self): d = self.dateIndex[0].to_datetime() tm.assert_isinstance(self.dateIndex.asof(d), Timestamp) + def test_asof_datetime_partial(self): + idx = pd.date_range('2010-01-01', periods=2, freq='m') + expected = Timestamp('2010-01-31') + result = idx.asof('2010-02') + self.assertEqual(result, expected) + def test_nanosecond_index_access(self): s = Series([Timestamp('20130101')]).values.view('i8')[0] r = DatetimeIndex([s + 50 + i for i in range(100)]) @@ -526,6 +543,13 @@ def test_intersection(self): self.assertTrue(tm.equalContents(result3, expected3)) self.assertEqual(result3.name, expected3.name) + # non-monotonic non-unique + idx1 = Index(['A','B','A','C']) + idx2 = Index(['B','D']) + expected = Index(['B'], dtype='object') + result = idx1.intersection(idx2) + self.assertTrue(result.equals(expected)) + def test_union(self): first = self.strIndex[5:20] second = self.strIndex[:10] @@ -557,8 +581,13 @@ def test_union(self): self.assertIsNone(union.name) def test_add(self): - firstCat = self.strIndex + self.dateIndex - secondCat = self.strIndex + self.strIndex + + # - API change GH 8226 + with tm.assert_produces_warning(): + self.strIndex + self.dateIndex + + firstCat = self.strIndex.union(self.dateIndex) + secondCat = self.strIndex.union(self.strIndex) if self.dateIndex.dtype == np.object_: appended = np.append(self.strIndex, self.dateIndex) @@ -611,29 +640,30 @@ def test_iadd_string(self): index += '_x' self.assertIn('a_x', index) - def test_diff(self): + def test_difference(self): + first = self.strIndex[5:20] second = self.strIndex[:10] answer = self.strIndex[10:20] first.name = 'name' # different names - result = first - second + result = first.difference(second) self.assertTrue(tm.equalContents(result, answer)) self.assertEqual(result.name, None) # same names second.name = 'name' - result = first - second + result = first.difference(second) self.assertEqual(result.name, 'name') # with empty - result = first.diff([]) + result = first.difference([]) self.assertTrue(tm.equalContents(result, first)) self.assertEqual(result.name, first.name) # with everythin - result = first.diff(first) + result = first.difference(first) self.assertEqual(len(result), 0) self.assertEqual(result.name, first.name) @@ -703,6 +733,13 @@ def test_is_numeric(self): self.assertTrue(self.intIndex.is_numeric()) self.assertTrue(self.floatIndex.is_numeric()) + def 
test_is_object(self): + self.assertTrue(self.strIndex.is_object()) + self.assertTrue(self.boolIndex.is_object()) + self.assertFalse(self.intIndex.is_object()) + self.assertFalse(self.dateIndex.is_object()) + self.assertFalse(self.floatIndex.is_object()) + def test_is_all_dates(self): self.assertTrue(self.dateIndex.is_all_dates) self.assertFalse(self.strIndex.is_all_dates) @@ -990,6 +1027,64 @@ def test_nan_first_take_datetime(self): exp = Index([idx[-1], idx[0], idx[1]]) tm.assert_index_equal(res, exp) + def test_reindex_preserves_name_if_target_is_list_or_ndarray(self): + # GH6552 + idx = pd.Index([0, 1, 2]) + + dt_idx = pd.date_range('20130101', periods=3) + + idx.name = None + self.assertEqual(idx.reindex([])[0].name, None) + self.assertEqual(idx.reindex(np.array([]))[0].name, None) + self.assertEqual(idx.reindex(idx.tolist())[0].name, None) + self.assertEqual(idx.reindex(idx.tolist()[:-1])[0].name, None) + self.assertEqual(idx.reindex(idx.values)[0].name, None) + self.assertEqual(idx.reindex(idx.values[:-1])[0].name, None) + + # Must preserve name even if dtype changes. + self.assertEqual(idx.reindex(dt_idx.values)[0].name, None) + self.assertEqual(idx.reindex(dt_idx.tolist())[0].name, None) + + idx.name = 'foobar' + self.assertEqual(idx.reindex([])[0].name, 'foobar') + self.assertEqual(idx.reindex(np.array([]))[0].name, 'foobar') + self.assertEqual(idx.reindex(idx.tolist())[0].name, 'foobar') + self.assertEqual(idx.reindex(idx.tolist()[:-1])[0].name, 'foobar') + self.assertEqual(idx.reindex(idx.values)[0].name, 'foobar') + self.assertEqual(idx.reindex(idx.values[:-1])[0].name, 'foobar') + + # Must preserve name even if dtype changes. + self.assertEqual(idx.reindex(dt_idx.values)[0].name, 'foobar') + self.assertEqual(idx.reindex(dt_idx.tolist())[0].name, 'foobar') + + def test_reindex_preserves_type_if_target_is_empty_list_or_array(self): + # GH7774 + idx = pd.Index(list('abc')) + def get_reindex_type(target): + return idx.reindex(target)[0].dtype.type + + self.assertEqual(get_reindex_type([]), np.object_) + self.assertEqual(get_reindex_type(np.array([])), np.object_) + self.assertEqual(get_reindex_type(np.array([], dtype=np.int64)), + np.object_) + + def test_reindex_doesnt_preserve_type_if_target_is_empty_index(self): + # GH7774 + idx = pd.Index(list('abc')) + def get_reindex_type(target): + return idx.reindex(target)[0].dtype.type + + self.assertEqual(get_reindex_type(pd.Int64Index([])), np.int64) + self.assertEqual(get_reindex_type(pd.Float64Index([])), np.float64) + self.assertEqual(get_reindex_type(pd.DatetimeIndex([])), np.datetime64) + + reindexed = idx.reindex(pd.MultiIndex([pd.Int64Index([]), + pd.Float64Index([])], + [[], []]))[0] + self.assertEqual(reindexed.levels[0].dtype.type, np.int64) + self.assertEqual(reindexed.levels[1].dtype.type, np.float64) + + class Numeric(Base): @@ -1632,6 +1727,21 @@ def test_numeric_compat(self): lambda : pd.date_range('2000-01-01', periods=3) * np.timedelta64(1, 'D').astype('m8[ns]') ]: self.assertRaises(TypeError, f) + def test_roundtrip_pickle_with_tz(self): + + # GH 8367 + # round-trip of timezone + index=date_range('20130101',periods=3,tz='US/Eastern',name='foo') + unpickled = self.round_trip_pickle(index) + self.assertTrue(index.equals(unpickled)) + + def test_reindex_preserves_tz_if_target_is_empty_list_or_array(self): + # GH7774 + index = date_range('20130101', periods=3, tz='US/Eastern') + self.assertEqual(str(index.reindex([])[0].tz), 'US/Eastern') + self.assertEqual(str(index.reindex(np.array([]))[0].tz), 'US/Eastern') + + class 
TestPeriodIndex(Base, tm.TestCase): _holder = PeriodIndex _multiprocess_can_split_ = True @@ -1642,6 +1752,52 @@ def create_index(self): def test_pickle_compat_construction(self): pass +class TestTimedeltaIndex(Base, tm.TestCase): + _holder = TimedeltaIndex + _multiprocess_can_split_ = True + + def create_index(self): + return pd.to_timedelta(range(5),unit='d') + pd.offsets.Hour(1) + + def test_numeric_compat(self): + + idx = self._holder(np.arange(5,dtype='int64')) + didx = self._holder(np.arange(5,dtype='int64')**2 + ) + result = idx * 1 + tm.assert_index_equal(result, idx) + + result = 1 * idx + tm.assert_index_equal(result, idx) + + result = idx / 1 + tm.assert_index_equal(result, idx) + + result = idx // 1 + tm.assert_index_equal(result, idx) + + result = idx * np.array(5,dtype='int64') + tm.assert_index_equal(result, self._holder(np.arange(5,dtype='int64')*5)) + + result = idx * np.arange(5,dtype='int64') + tm.assert_index_equal(result, didx) + + result = idx * Series(np.arange(5,dtype='int64')) + tm.assert_index_equal(result, didx) + + result = idx * Series(np.arange(5,dtype='float64')+0.1) + tm.assert_index_equal(result, + Float64Index(np.arange(5,dtype='float64')*(np.arange(5,dtype='float64')+0.1))) + + + # invalid + self.assertRaises(TypeError, lambda : idx * idx) + self.assertRaises(ValueError, lambda : idx * self._holder(np.arange(3))) + self.assertRaises(ValueError, lambda : idx * np.array([1,2])) + + def test_pickle_compat_construction(self): + pass + class TestMultiIndex(Base, tm.TestCase): _holder = MultiIndex _multiprocess_can_split_ = True @@ -2179,6 +2335,18 @@ def test_from_product_datetimeindex(self): (2, pd.Timestamp('2000-01-02'))]) assert_array_equal(mi.values, etalon) + def test_values_boxed(self): + tuples = [(1, pd.Timestamp('2000-01-01')), + (2, pd.NaT), + (3, pd.Timestamp('2000-01-03')), + (1, pd.Timestamp('2000-01-04')), + (2, pd.Timestamp('2000-01-02')), + (3, pd.Timestamp('2000-01-03'))] + mi = pd.MultiIndex.from_tuples(tuples) + assert_array_equal(mi.values, pd.lib.list_to_object_array(tuples)) + # Check that code branches for boxed values produce identical results + assert_array_equal(mi.values[:4], mi[:4].values) + def test_append(self): result = self.index[:3].append(self.index[3:]) self.assertTrue(result.equals(self.index)) @@ -2288,6 +2456,14 @@ def test_legacy_v2_unpickle(self): assert_almost_equal(res, exp) assert_almost_equal(exp, exp2) + def test_roundtrip_pickle_with_tz(self): + + # GH 8367 + # round-trip of timezone + index=MultiIndex.from_product([[1,2],['a','b'],date_range('20130101',periods=3,tz='US/Eastern')],names=['one','two','three']) + unpickled = self.round_trip_pickle(index) + self.assertTrue(index.equal_levels(unpickled)) + def test_from_tuples_index_values(self): result = MultiIndex.from_tuples(self.index) self.assertTrue((result.values == self.index.values).all()) @@ -2580,7 +2756,6 @@ def test_format_sparse_display(self): self.assertEqual(result[3], '1 0 0 0') def test_format_sparse_config(self): - import warnings warn_filters = warnings.filters warnings.filterwarnings('ignore', category=FutureWarning, @@ -2775,9 +2950,15 @@ def test_intersection(self): # result = self.index & tuples # self.assertTrue(result.equals(tuples)) - def test_diff(self): + def test_difference(self): + first = self.index - result = first - self.index[-3:] + result = first.difference(self.index[-3:]) + + # - API change GH 8226 + with tm.assert_produces_warning(): + first - self.index[-3:] + expected = MultiIndex.from_tuples(sorted(self.index[:-3].values), 
sortorder=0, names=self.index.names) @@ -2787,19 +2968,19 @@ def test_diff(self): self.assertEqual(result.names, self.index.names) # empty difference: reflexive - result = self.index - self.index + result = self.index.difference(self.index) expected = self.index[:0] self.assertTrue(result.equals(expected)) self.assertEqual(result.names, self.index.names) # empty difference: superset - result = self.index[-3:] - self.index + result = self.index[-3:].difference(self.index) expected = self.index[:0] self.assertTrue(result.equals(expected)) self.assertEqual(result.names, self.index.names) # empty difference: degenerate - result = self.index[:0] - self.index + result = self.index[:0].difference(self.index) expected = self.index[:0] self.assertTrue(result.equals(expected)) self.assertEqual(result.names, self.index.names) @@ -2807,31 +2988,31 @@ def test_diff(self): # names not the same chunklet = self.index[-3:] chunklet.names = ['foo', 'baz'] - result = first - chunklet + result = first.difference(chunklet) self.assertEqual(result.names, (None, None)) # empty, but non-equal - result = self.index - self.index.sortlevel(1)[0] + result = self.index.difference(self.index.sortlevel(1)[0]) self.assertEqual(len(result), 0) # raise Exception called with non-MultiIndex - result = first.diff(first._tuple_index) + result = first.difference(first._tuple_index) self.assertTrue(result.equals(first[:0])) # name from empty array - result = first.diff([]) + result = first.difference([]) self.assertTrue(first.equals(result)) self.assertEqual(first.names, result.names) # name from non-empty array - result = first.diff([('foo', 'one')]) + result = first.difference([('foo', 'one')]) expected = pd.MultiIndex.from_tuples([('bar', 'one'), ('baz', 'two'), ('foo', 'two'), ('qux', 'one'), ('qux', 'two')]) expected.names = first.names self.assertEqual(first.names, result.names) assertRaisesRegexp(TypeError, "other must be a MultiIndex or a list" - " of tuples", first.diff, [1, 2, 3, 4, 5]) + " of tuples", first.difference, [1, 2, 3, 4, 5]) def test_from_tuples(self): assertRaisesRegexp(TypeError, 'Cannot infer number of levels from' @@ -3171,6 +3352,45 @@ def test_isin_level_kwarg(self): self.assertRaises(KeyError, idx.isin, vals_1, level='C') + def test_reindex_preserves_names_when_target_is_list_or_ndarray(self): + # GH6552 + idx = self.index.copy() + target = idx.copy() + idx.names = target.names = [None, None] + + other_dtype = pd.MultiIndex.from_product([[1, 2], [3, 4]]) + + # list & ndarray cases + self.assertEqual(idx.reindex([])[0].names, [None, None]) + self.assertEqual(idx.reindex(np.array([]))[0].names, [None, None]) + self.assertEqual(idx.reindex(target.tolist())[0].names, [None, None]) + self.assertEqual(idx.reindex(target.values)[0].names, [None, None]) + self.assertEqual(idx.reindex(other_dtype.tolist())[0].names, [None, None]) + self.assertEqual(idx.reindex(other_dtype.values)[0].names, [None, None]) + + idx.names = ['foo', 'bar'] + self.assertEqual(idx.reindex([])[0].names, ['foo', 'bar']) + self.assertEqual(idx.reindex(np.array([]))[0].names, ['foo', 'bar']) + self.assertEqual(idx.reindex(target.tolist())[0].names, ['foo', 'bar']) + self.assertEqual(idx.reindex(target.values)[0].names, ['foo', 'bar']) + self.assertEqual(idx.reindex(other_dtype.tolist())[0].names, ['foo', 'bar']) + self.assertEqual(idx.reindex(other_dtype.values)[0].names, ['foo', 'bar']) + + def test_reindex_lvl_preserves_names_when_target_is_list_or_array(self): + # GH7774 + idx = pd.MultiIndex.from_product([[0, 1], ['a', 'b']], + 
names=['foo', 'bar']) + self.assertEqual(idx.reindex([], level=0)[0].names, ['foo', 'bar']) + self.assertEqual(idx.reindex([], level=1)[0].names, ['foo', 'bar']) + + def test_reindex_lvl_preserves_type_if_target_is_empty_list_or_array(self): + # GH7774 + idx = pd.MultiIndex.from_product([[0, 1], ['a', 'b']]) + self.assertEqual(idx.reindex([], level=0)[0].levels[0].dtype.type, + np.int64) + self.assertEqual(idx.reindex([], level=1)[0].levels[1].dtype.type, + np.object_) + def test_get_combined_index(): from pandas.core.index import _get_combined_index diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index e7bb716de60f3..4eb06db57b054 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -2,8 +2,10 @@ import nose import itertools import warnings +from datetime import datetime from pandas.compat import range, lrange, lzip, StringIO, lmap, map +from pandas.tslib import NaT from numpy import nan from numpy.random import randn import numpy as np @@ -12,9 +14,10 @@ import pandas.core.common as com from pandas import option_context from pandas.core.api import (DataFrame, Index, Series, Panel, isnull, - MultiIndex, Float64Index, Timestamp) + MultiIndex, Float64Index, Timestamp, Timedelta) from pandas.util.testing import (assert_almost_equal, assert_series_equal, - assert_frame_equal, assert_panel_equal) + assert_frame_equal, assert_panel_equal, + assert_attr_equal) from pandas import concat import pandas.util.testing as tm @@ -319,7 +322,7 @@ def _check(f, func, values = False): _check(d['ts'], 'at') _check(d['floats'],'at') - def test_at_timestamp(self): + def test_at_iat_coercion(self): # as timestamp is not a tuple! dates = date_range('1/1/2000', periods=8) @@ -330,6 +333,22 @@ def test_at_timestamp(self): xp = s.values[5] self.assertEqual(result, xp) + # GH 7729 + # make sure we are boxing the returns + s = Series(['2014-01-01', '2014-02-02'], dtype='datetime64[ns]') + expected = Timestamp('2014-02-02') + + for r in [ lambda : s.iat[1], lambda : s.iloc[1] ]: + result = r() + self.assertEqual(result, expected) + + s = Series(['1 days','2 days'], dtype='timedelta64[ns]') + expected = Timedelta('2 days') + + for r in [ lambda : s.iat[1], lambda : s.iloc[1] ]: + result = r() + self.assertEqual(result, expected) + def test_iat_invalid_args(self): pass @@ -847,6 +866,28 @@ def f(): df.loc[[3]] self.assertRaises(KeyError, f) + # at should not fallback + # GH 7814 + s = Series([1,2,3], index=list('abc')) + result = s.at['a'] + self.assertEquals(result, 1) + self.assertRaises(ValueError, lambda : s.at[0]) + + df = DataFrame({'A' : [1,2,3]},index=list('abc')) + result = df.at['a','A'] + self.assertEquals(result, 1) + self.assertRaises(ValueError, lambda : df.at['a',0]) + + s = Series([1,2,3], index=[3,2,1]) + result = s.at[1] + self.assertEquals(result, 3) + self.assertRaises(ValueError, lambda : s.at['a']) + + df = DataFrame({0 : [1,2,3]},index=[3,2,1]) + result = df.at[1,0] + self.assertEquals(result, 3) + self.assertRaises(ValueError, lambda : df.at['a',0]) + def test_loc_getitem_label_slice(self): # label slices (with ints) @@ -990,11 +1031,12 @@ def test_loc_setitem_frame(self): def test_loc_setitem_frame_multiples(self): - # multiple setting df = DataFrame({ 'A' : ['foo','bar','baz'], 'B' : Series(range(3),dtype=np.int64) }) - df.loc[0:1] = df.loc[1:2] + rhs = df.loc[1:2] + rhs.index = df.index[0:2] + df.loc[0:1] = rhs expected = DataFrame({ 'A' : ['bar','baz','baz'], 'B' : Series([1,2,2],dtype=np.int64) }) assert_frame_equal(df, expected) @@ 
-1006,8 +1048,9 @@ def test_loc_setitem_frame_multiples(self): expected = DataFrame({ 'date' : [Timestamp('20000101'),Timestamp('20000102'),Timestamp('20000101'), Timestamp('20000102'),Timestamp('20000103')], 'val' : Series([0,1,0,1,2],dtype=np.int64) }) - - df.loc[2:4] = df.loc[0:2] + rhs = df.loc[0:2] + rhs.index = df.index[2:5] + df.loc[2:4] = rhs assert_frame_equal(df, expected) def test_iloc_getitem_frame(self): @@ -1661,6 +1704,76 @@ def test_multiindex_slicers_datetimelike(self): result = df.loc[(idx['2012-01-01 12:12:12':'2012-01-03 12:12:12'],1), idx['A','B']] assert_frame_equal(result,expected) + + def test_multiindex_slicers_edges(self): + + # GH 8132 + # various edge cases + df = DataFrame({'A': ['A0'] * 5 + ['A1']*5 + ['A2']*5, + 'B': ['B0','B0','B1','B1','B2'] * 3, + 'DATE': ["2013-06-11", + "2013-07-02", + "2013-07-09", + "2013-07-30", + "2013-08-06", + "2013-06-11", + "2013-07-02", + "2013-07-09", + "2013-07-30", + "2013-08-06", + "2013-09-03", + "2013-10-01", + "2013-07-09", + "2013-08-06", + "2013-09-03"], + 'VALUES': [22, 35, 14, 9, 4, 40, 18, 4, 2, 5, 1, 2, 3,4, 2]}) + + df['DATE'] = pd.to_datetime(df['DATE']) + df1 = df.set_index(['A', 'B', 'DATE']) + df1 = df1.sortlevel() + df2 = df.set_index('DATE') + + # A1 - Get all values under "A0" and "A1" + result = df1.loc[(slice('A1')),:] + expected = df1.iloc[0:10] + assert_frame_equal(result, expected) + + # A2 - Get all values from the start to "A2" + result = df1.loc[(slice('A2')),:] + expected = df1 + assert_frame_equal(result, expected) + + # A3 - Get all values under "B1" or "B2" + result = df1.loc[(slice(None),slice('B1','B2')),:] + expected = df1.iloc[[2,3,4,7,8,9,12,13,14]] + assert_frame_equal(result, expected) + + # A4 - Get all values between 2013-07-02 and 2013-07-09 + result = df1.loc[(slice(None),slice(None),slice('20130702','20130709')),:] + expected = df1.iloc[[1,2,6,7,12]] + assert_frame_equal(result, expected) + + # B1 - Get all values in B0 that are also under A0, A1 and A2 + result = df1.loc[(slice('A2'),slice('B0')),:] + expected = df1.iloc[[0,1,5,6,10,11]] + assert_frame_equal(result, expected) + + # B2 - Get all values in B0, B1 and B2 (similar to what #2 is doing for the As) + result = df1.loc[(slice(None),slice('B2')),:] + expected = df1 + assert_frame_equal(result, expected) + + # B3 - Get all values from B1 to B2 and up to 2013-08-06 + result = df1.loc[(slice(None),slice('B1','B2'),slice('2013-08-06')),:] + expected = df1.iloc[[2,3,4,7,8,9,12,13]] + assert_frame_equal(result, expected) + + # B4 - Same as A4 but the start of the date slice is not a key. 
+ # shows indexing on a partial selection slice + result = df1.loc[(slice(None),slice(None),slice('20130701','20130709')),:] + expected = df1.iloc[[1,2,6,7,12]] + assert_frame_equal(result, expected) + def test_per_axis_per_level_doc_examples(self): # test index maker @@ -2180,6 +2293,33 @@ def test_panel_getitem(self): test1 = panel.ix[:, "2002"] tm.assert_panel_equal(test1,test2) + def test_panel_setitem(self): + + # GH 7763 + # loc and setitem have setting differences + np.random.seed(0) + index=range(3) + columns = list('abc') + + panel = Panel({'A' : DataFrame(np.random.randn(3, 3), index=index, columns=columns), + 'B' : DataFrame(np.random.randn(3, 3), index=index, columns=columns), + 'C' : DataFrame(np.random.randn(3, 3), index=index, columns=columns) + }) + + replace = DataFrame(np.eye(3,3), index=range(3), columns=columns) + expected = Panel({ 'A' : replace, 'B' : replace, 'C' : replace }) + + p = panel.copy() + for idx in list('ABC'): + p[idx] = replace + tm.assert_panel_equal(p, expected) + + p = panel.copy() + for idx in list('ABC'): + p.loc[idx,:,:] = replace + tm.assert_panel_equal(p, expected) + + def test_panel_assignment(self): # GH3777 @@ -2739,7 +2879,8 @@ def f(): df.loc[1] = df.loc[2] assert_frame_equal(df,expected) - expected = DataFrame(dict({ 'A' : [0,2,4,4], 'B' : [1,3,5,5] }),dtype='float64') + # like 2578, partial setting with dtype preservation + expected = DataFrame(dict({ 'A' : [0,2,4,4], 'B' : [1,3,5,5] })) df = df_orig.copy() df.loc[3] = df.loc[2] assert_frame_equal(df,expected) @@ -2791,6 +2932,61 @@ def f(): p.loc[:,:,'C'] = Series([30,32],index=p_orig.items) assert_panel_equal(p,expected) + # GH 8473 + dates = date_range('1/1/2000', periods=8) + df_orig = DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D']) + + expected = pd.concat([df_orig,DataFrame({'A' : 7},index=[dates[-1]+1])]) + df = df_orig.copy() + df.loc[dates[-1]+1, 'A'] = 7 + assert_frame_equal(df,expected) + df = df_orig.copy() + df.at[dates[-1]+1, 'A'] = 7 + assert_frame_equal(df,expected) + + expected = pd.concat([df_orig,DataFrame({0 : 7},index=[dates[-1]+1])],axis=1) + df = df_orig.copy() + df.loc[dates[-1]+1, 0] = 7 + assert_frame_equal(df,expected) + df = df_orig.copy() + df.at[dates[-1]+1, 0] = 7 + assert_frame_equal(df,expected) + + def test_partial_setting_mixed_dtype(self): + + # in a mixed dtype environment, try to preserve dtypes + # by appending + df = DataFrame([[True, 1],[False, 2]], + columns = ["female","fitness"]) + + s = df.loc[1].copy() + s.name = 2 + expected = df.append(s) + + df.loc[2] = df.loc[1] + assert_frame_equal(df, expected) + + # columns will align + df = DataFrame(columns=['A','B']) + df.loc[0] = Series(1,index=range(4)) + assert_frame_equal(df,DataFrame(columns=['A','B'],index=[0])) + + # columns will align + df = DataFrame(columns=['A','B']) + df.loc[0] = Series(1,index=['B']) + assert_frame_equal(df,DataFrame([[np.nan, 1]], columns=['A','B'],index=[0],dtype='float64')) + + # list-like must conform + df = DataFrame(columns=['A','B']) + def f(): + df.loc[0] = [1,2,3] + self.assertRaises(ValueError, f) + + # these are coerced to float unavoidably (as its a list-like to begin) + df = DataFrame(columns=['A','B']) + df.loc[3] = [6,7] + assert_frame_equal(df,DataFrame([[6,7]],index=[3],columns=['A','B'],dtype='float64')) + def test_series_partial_set(self): # partial set with new index # Regression from GH4825 @@ -2940,15 +3136,6 @@ def f(): assert_frame_equal(df,DataFrame([[1]],index=['foo'],columns=[1])) assert_frame_equal(df,df2) - df = 
DataFrame(columns=['A','B']) - df.loc[3] = [6,7] - assert_frame_equal(df,DataFrame([[6,7]],index=[3],columns=['A','B'])) - - # no label overlap - df = DataFrame(columns=['A','B']) - df.loc[0] = Series(1,index=range(4)) - assert_frame_equal(df,DataFrame(columns=['A','B'],index=[0])) - # no index to start expected = DataFrame({ 0 : Series(1,index=range(4)) },columns=['A','B',0]) @@ -3665,26 +3852,41 @@ def test_set_ix_out_of_bounds_axis_1(self): def test_iloc_empty_list_indexer_is_ok(self): from pandas.util.testing import makeCustomDataframe as mkdf df = mkdf(5, 2) - assert_frame_equal(df.iloc[:,[]], df.iloc[:, :0]) # vertical empty - assert_frame_equal(df.iloc[[],:], df.iloc[:0, :]) # horizontal empty - assert_frame_equal(df.iloc[[]], df.iloc[:0, :]) # horizontal empty + # vertical empty + assert_frame_equal(df.iloc[:, []], df.iloc[:, :0], + check_index_type=True, check_column_type=True) + # horizontal empty + assert_frame_equal(df.iloc[[], :], df.iloc[:0, :], + check_index_type=True, check_column_type=True) + # horizontal empty + assert_frame_equal(df.iloc[[]], df.iloc[:0, :], + check_index_type=True, check_column_type=True) - # FIXME: fix loc & xs def test_loc_empty_list_indexer_is_ok(self): - raise nose.SkipTest('loc discards columns names') from pandas.util.testing import makeCustomDataframe as mkdf df = mkdf(5, 2) - assert_frame_equal(df.loc[:,[]], df.iloc[:, :0]) # vertical empty - assert_frame_equal(df.loc[[],:], df.iloc[:0, :]) # horizontal empty - assert_frame_equal(df.loc[[]], df.iloc[:0, :]) # horizontal empty + # vertical empty + assert_frame_equal(df.loc[:, []], df.iloc[:, :0], + check_index_type=True, check_column_type=True) + # horizontal empty + assert_frame_equal(df.loc[[], :], df.iloc[:0, :], + check_index_type=True, check_column_type=True) + # horizontal empty + assert_frame_equal(df.loc[[]], df.iloc[:0, :], + check_index_type=True, check_column_type=True) def test_ix_empty_list_indexer_is_ok(self): - raise nose.SkipTest('ix discards columns names') from pandas.util.testing import makeCustomDataframe as mkdf df = mkdf(5, 2) - assert_frame_equal(df.ix[:,[]], df.iloc[:, :0]) # vertical empty - assert_frame_equal(df.ix[[],:], df.iloc[:0, :]) # horizontal empty - assert_frame_equal(df.ix[[]], df.iloc[:0, :]) # horizontal empty + # vertical empty + assert_frame_equal(df.ix[:, []], df.iloc[:, :0], + check_index_type=True, check_column_type=True) + # horizontal empty + assert_frame_equal(df.ix[[], :], df.iloc[:0, :], + check_index_type=True, check_column_type=True) + # horizontal empty + assert_frame_equal(df.ix[[]], df.iloc[:0, :], + check_index_type=True, check_column_type=True) def test_deprecate_float_indexers(self): @@ -3815,6 +4017,194 @@ def test_float_index_non_scalar_assignment(self): df.loc[df.index] = df.loc[df.index] tm.assert_frame_equal(df,df2) + def test_float_index_at_iat(self): + s = pd.Series([1, 2, 3], index=[0.1, 0.2, 0.3]) + for el, item in s.iteritems(): + self.assertEqual(s.at[el], item) + for i in range(len(s)): + self.assertEqual(s.iat[i], i + 1) + + def test_rhs_alignment(self): + # GH8258, tests that both rows & columns are aligned to what is + # assigned to. 
covers both uniform data-type & multi-type cases + def run_tests(df, rhs, right): + # label, index, slice + r, i, s = list('bcd'), [1, 2, 3], slice(1, 4) + c, j, l = ['joe', 'jolie'], [1, 2], slice(1, 3) + + left = df.copy() + left.loc[r, c] = rhs + assert_frame_equal(left, right) + + left = df.copy() + left.iloc[i, j] = rhs + assert_frame_equal(left, right) + + left = df.copy() + left.ix[s, l] = rhs + assert_frame_equal(left, right) + + left = df.copy() + left.ix[i, j] = rhs + assert_frame_equal(left, right) + + left = df.copy() + left.ix[r, c] = rhs + assert_frame_equal(left, right) + + xs = np.arange(20).reshape(5, 4) + cols = ['jim', 'joe', 'jolie', 'joline'] + df = pd.DataFrame(xs, columns=cols, index=list('abcde')) + + # right hand side; permute the indices and multiplpy by -2 + rhs = - 2 * df.iloc[3:0:-1, 2:0:-1] + + # expected `right` result; just multiply by -2 + right = df.copy() + right.iloc[1:4, 1:3] *= -2 + + # run tests with uniform dtypes + run_tests(df, rhs, right) + + # make frames multi-type & re-run tests + for frame in [df, rhs, right]: + frame['joe'] = frame['joe'].astype('float64') + frame['jolie'] = frame['jolie'].map('@{0}'.format) + + run_tests(df, rhs, right) + +class TestSeriesNoneCoercion(tm.TestCase): + EXPECTED_RESULTS = [ + # For numeric series, we should coerce to NaN. + ([1, 2, 3], [np.nan, 2, 3]), + ([1.0, 2.0, 3.0], [np.nan, 2.0, 3.0]), + + # For datetime series, we should coerce to NaT. + ([datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)]), + + # For objects, we should preserve the None value. + (["foo", "bar", "baz"], [None, "bar", "baz"]), + ] + + def test_coercion_with_setitem(self): + for start_data, expected_result in self.EXPECTED_RESULTS: + start_series = Series(start_data) + start_series[0] = None + + expected_series = Series(expected_result) + + assert_attr_equal('dtype', start_series, expected_series) + self.assert_numpy_array_equivalent( + start_series.values, + expected_series.values, strict_nan=True) + + def test_coercion_with_loc_setitem(self): + for start_data, expected_result in self.EXPECTED_RESULTS: + start_series = Series(start_data) + start_series.loc[0] = None + + expected_series = Series(expected_result) + + assert_attr_equal('dtype', start_series, expected_series) + self.assert_numpy_array_equivalent( + start_series.values, + expected_series.values, strict_nan=True) + + def test_coercion_with_setitem_and_series(self): + for start_data, expected_result in self.EXPECTED_RESULTS: + start_series = Series(start_data) + start_series[start_series == start_series[0]] = None + + expected_series = Series(expected_result) + + assert_attr_equal('dtype', start_series, expected_series) + self.assert_numpy_array_equivalent( + start_series.values, + expected_series.values, strict_nan=True) + + def test_coercion_with_loc_and_series(self): + for start_data, expected_result in self.EXPECTED_RESULTS: + start_series = Series(start_data) + start_series.loc[start_series == start_series[0]] = None + + expected_series = Series(expected_result) + + assert_attr_equal('dtype', start_series, expected_series) + self.assert_numpy_array_equivalent( + start_series.values, + expected_series.values, strict_nan=True) + + +class TestDataframeNoneCoercion(tm.TestCase): + EXPECTED_SINGLE_ROW_RESULTS = [ + # For numeric series, we should coerce to NaN. + ([1, 2, 3], [np.nan, 2, 3]), + ([1.0, 2.0, 3.0], [np.nan, 2.0, 3.0]), + + # For datetime series, we should coerce to NaT. 
+ ([datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)]), + + # For objects, we should preserve the None value. + (["foo", "bar", "baz"], [None, "bar", "baz"]), + ] + + def test_coercion_with_loc(self): + for start_data, expected_result, in self.EXPECTED_SINGLE_ROW_RESULTS: + start_dataframe = DataFrame({'foo': start_data}) + start_dataframe.loc[0, ['foo']] = None + + expected_dataframe = DataFrame({'foo': expected_result}) + + assert_attr_equal('dtype', start_dataframe['foo'], expected_dataframe['foo']) + self.assert_numpy_array_equivalent( + start_dataframe['foo'].values, + expected_dataframe['foo'].values, strict_nan=True) + + def test_coercion_with_setitem_and_dataframe(self): + for start_data, expected_result, in self.EXPECTED_SINGLE_ROW_RESULTS: + start_dataframe = DataFrame({'foo': start_data}) + start_dataframe[start_dataframe['foo'] == start_dataframe['foo'][0]] = None + + expected_dataframe = DataFrame({'foo': expected_result}) + + assert_attr_equal('dtype', start_dataframe['foo'], expected_dataframe['foo']) + self.assert_numpy_array_equivalent( + start_dataframe['foo'].values, + expected_dataframe['foo'].values, strict_nan=True) + + def test_none_coercion_loc_and_dataframe(self): + for start_data, expected_result, in self.EXPECTED_SINGLE_ROW_RESULTS: + start_dataframe = DataFrame({'foo': start_data}) + start_dataframe.loc[start_dataframe['foo'] == start_dataframe['foo'][0]] = None + + expected_dataframe = DataFrame({'foo': expected_result}) + + assert_attr_equal('dtype', start_dataframe['foo'], expected_dataframe['foo']) + self.assert_numpy_array_equivalent( + start_dataframe['foo'].values, + expected_dataframe['foo'].values, strict_nan=True) + + def test_none_coercion_mixed_dtypes(self): + start_dataframe = DataFrame({ + 'a': [1, 2, 3], + 'b': [1.0, 2.0, 3.0], + 'c': [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + 'd': ['a', 'b', 'c']}) + start_dataframe.iloc[0] = None + + expected_dataframe = DataFrame({ + 'a': [np.nan, 2, 3], + 'b': [np.nan, 2.0, 3.0], + 'c': [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)], + 'd': [None, 'b', 'c']}) + + for column in expected_dataframe.columns: + assert_attr_equal('dtype', start_dataframe[column], expected_dataframe[column]) + self.assert_numpy_array_equivalent( + start_dataframe[column].values, + expected_dataframe[column].values, strict_nan=True) if __name__ == '__main__': diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index a523df4cc2461..5bc7558efb471 100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -11,7 +11,7 @@ import pandas.util.testing as tm import pandas as pd from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, randn) + assert_almost_equal, assert_frame_equal, randn, assert_series_equal) from pandas.compat import zip, u @@ -363,6 +363,15 @@ def test_non_unique_pickle(self): mgr2 = self.round_trip_pickle(mgr) assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) + def test_categorical_block_pickle(self): + mgr = create_mgr('a: category') + mgr2 = self.round_trip_pickle(mgr) + assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) + + smgr = create_single_mgr('category') + smgr2 = self.round_trip_pickle(smgr) + assert_series_equal(Series(smgr), Series(smgr2)) + def test_get_scalar(self): for item in self.mgr.items: for i, index in enumerate(self.mgr.axes[1]): diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 
a0c5d3ce5959a..2171b8e8428a4 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1,12 +1,13 @@ # pylint: disable-msg=W0612,E1101,W0141 import datetime +import itertools import nose from numpy.random import randn import numpy as np from pandas.core.index import Index, MultiIndex -from pandas import Panel, DataFrame, Series, notnull, isnull +from pandas import Panel, DataFrame, Series, notnull, isnull, Timestamp from pandas.util.testing import (assert_almost_equal, assert_series_equal, @@ -213,6 +214,44 @@ def test_sort_index_preserve_levels(self): result = self.frame.sort_index() self.assertEqual(result.index.names, self.frame.index.names) + def test_sorting_repr_8017(self): + + np.random.seed(0) + data = np.random.randn(3,4) + + for gen, extra in [([1.,3.,2.,5.],4.), + ([1,3,2,5],4), + ([Timestamp('20130101'),Timestamp('20130103'),Timestamp('20130102'),Timestamp('20130105')],Timestamp('20130104')), + (['1one','3one','2one','5one'],'4one')]: + columns = MultiIndex.from_tuples([('red', i) for i in gen]) + df = DataFrame(data, index=list('def'), columns=columns) + df2 = pd.concat([df,DataFrame('world', + index=list('def'), + columns=MultiIndex.from_tuples([('red', extra)]))],axis=1) + + # check that the repr is good + # make sure that we have a correct sparsified repr + # e.g. only 1 header of read + self.assertEqual(str(df2).splitlines()[0].split(),['red']) + + # GH 8017 + # sorting fails after columns added + + # construct single-dtype then sort + result = df.copy().sort_index(axis=1) + expected = df.iloc[:,[0,2,1,3]] + assert_frame_equal(result, expected) + + result = df2.sort_index(axis=1) + expected = df2.iloc[:,[0,2,1,4,3]] + assert_frame_equal(result, expected) + + # setitem then sort + result = df.copy() + result[('red',extra)] = 'world' + result = result.sort_index(axis=1) + assert_frame_equal(result, expected) + def test_repr_to_string(self): repr(self.frame) repr(self.ymd) @@ -2066,6 +2105,17 @@ def test_datetimeindex(self): self.assertTrue(idx.levels[0].equals(expected1)) self.assertTrue(idx.levels[1].equals(idx2)) + # from datetime combos + # GH 7888 + date1 = datetime.date.today() + date2 = datetime.datetime.today() + date3 = Timestamp.today() + + for d1, d2 in itertools.product([date1,date2,date3],[date1,date2,date3]): + index = pd.MultiIndex.from_product([[d1],[d2]]) + self.assertIsInstance(index.levels[0],pd.DatetimeIndex) + self.assertIsInstance(index.levels[1],pd.DatetimeIndex) + def test_set_index_datetime(self): # GH 3950 df = pd.DataFrame({'label':['a', 'a', 'a', 'b', 'b', 'b'], diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 3e8a5fecbb579..3ec00fee1d151 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -118,11 +118,32 @@ def setUp(self): def check_results(self, targ, res, axis): res = getattr(res, 'asm8', res) res = getattr(res, 'values', res) - if axis != 0 and hasattr(targ, 'shape') and targ.ndim: - res = np.split(res, [targ.shape[0]], axis=0)[0] + + # timedeltas are a beast here + def _coerce_tds(targ, res): + if targ.dtype == 'm8[ns]': + if len(targ) == 1: + targ = targ[0].item() + res = res.item() + else: + targ = targ.view('i8') + return targ, res + + try: + if axis != 0 and hasattr(targ, 'shape') and targ.ndim: + res = np.split(res, [targ.shape[0]], axis=0)[0] + except: + targ, res = _coerce_tds(targ, res) + try: tm.assert_almost_equal(targ, res) except: + + if targ.dtype == 'm8[ns]': + targ, res = _coerce_tds(targ, res) + tm.assert_almost_equal(targ, res) + return + # 
There are sometimes rounding errors with # complex and object dtypes. # If it isn't one of those, re-raise the error. @@ -208,7 +229,7 @@ def check_fun(self, testfunc, targfunc, def check_funs(self, testfunc, targfunc, allow_complex=True, allow_all_nan=True, allow_str=True, - allow_date=True, allow_obj=True, + allow_date=True, allow_tdelta=True, allow_obj=True, **kwargs): self.check_fun(testfunc, targfunc, 'arr_float', **kwargs) self.check_fun(testfunc, targfunc, 'arr_float_nan', 'arr_float', @@ -244,6 +265,8 @@ def check_funs(self, testfunc, targfunc, else: self.check_fun(testfunc, targfunc, 'arr_date', **kwargs) objs += [self.arr_date.astype('O')] + + if allow_tdelta: try: targfunc(self.arr_tdelta) except TypeError: @@ -264,12 +287,12 @@ def check_funs(self, testfunc, targfunc, def check_funs_ddof(self, testfunc, targfunc, allow_complex=True, allow_all_nan=True, allow_str=True, - allow_date=True, allow_obj=True,): + allow_date=False, allow_tdelta=False, allow_obj=True,): for ddof in range(3): try: self.check_funs(self, testfunc, targfunc, allow_complex, allow_all_nan, allow_str, - allow_date, allow_obj, + allow_date, allow_tdelta, allow_obj, ddof=ddof) except BaseException as exc: exc.args += ('ddof %s' % ddof,) @@ -284,34 +307,39 @@ def _badobj_wrap(self, value, func, allow_complex=True, **kwargs): def test_nanany(self): self.check_funs(nanops.nanany, np.any, - allow_all_nan=False, allow_str=False, allow_date=False) + allow_all_nan=False, allow_str=False, allow_date=False, allow_tdelta=False) def test_nanall(self): self.check_funs(nanops.nanall, np.all, - allow_all_nan=False, allow_str=False, allow_date=False) + allow_all_nan=False, allow_str=False, allow_date=False, allow_tdelta=False) def test_nansum(self): self.check_funs(nanops.nansum, np.sum, - allow_str=False, allow_date=False) + allow_str=False, allow_date=False, allow_tdelta=True) def test_nanmean(self): self.check_funs(nanops.nanmean, np.mean, allow_complex=False, allow_obj=False, - allow_str=False, allow_date=False) + allow_str=False, allow_date=False, allow_tdelta=True) def test_nanmedian(self): self.check_funs(nanops.nanmedian, np.median, allow_complex=False, allow_str=False, allow_date=False, + allow_tdelta=True, allow_obj='convert') def test_nanvar(self): self.check_funs_ddof(nanops.nanvar, np.var, - allow_complex=False, allow_date=False) + allow_complex=False, allow_date=False, allow_tdelta=False) + + def test_nanstd(self): + self.check_funs_ddof(nanops.nanstd, np.std, + allow_complex=False, allow_date=False, allow_tdelta=True) def test_nansem(self): tm.skip_if_no_package('scipy.stats') self.check_funs_ddof(nanops.nansem, np.var, - allow_complex=False, allow_date=False) + allow_complex=False, allow_date=False, allow_tdelta=False) def _minmax_wrap(self, value, axis=None, func=None): res = func(value, axis) @@ -343,13 +371,16 @@ def _argminmax_wrap(self, value, axis=None, func=None): def test_nanargmax(self): func = partial(self._argminmax_wrap, func=np.argmax) self.check_funs(nanops.nanargmax, func, - allow_str=False, allow_obj=False) + allow_str=False, allow_obj=False, + allow_date=True, + allow_tdelta=True) def test_nanargmin(self): func = partial(self._argminmax_wrap, func=np.argmin) if tm.sys.version_info[0:2] == (2, 6): self.check_funs(nanops.nanargmin, func, - allow_date=False, + allow_date=True, + allow_tdelta=True, allow_str=False, allow_obj=False) else: self.check_funs(nanops.nanargmin, func, @@ -372,7 +403,7 @@ def test_nanskew(self): from scipy.stats import skew func = partial(self._skew_kurt_wrap, func=skew) 
self.check_funs(nanops.nanskew, func, - allow_complex=False, allow_str=False, allow_date=False) + allow_complex=False, allow_str=False, allow_date=False, allow_tdelta=False) def test_nankurt(self): tm.skip_if_no_package('scipy.stats') @@ -380,11 +411,11 @@ def test_nankurt(self): func1 = partial(kurtosis, fisher=True) func = partial(self._skew_kurt_wrap, func=func1) self.check_funs(nanops.nankurt, func, - allow_complex=False, allow_str=False, allow_date=False) + allow_complex=False, allow_str=False, allow_date=False, allow_tdelta=False) def test_nanprod(self): self.check_funs(nanops.nanprod, np.prod, - allow_str=False, allow_date=False) + allow_str=False, allow_date=False, allow_tdelta=False) def check_nancorr_nancov_2d(self, checkfun, targ0, targ1, **kwargs): res00 = checkfun(self.arr_float_2d, self.arr_float1_2d, diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index fb1f1c1693fdd..736cdf312b361 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -18,6 +18,7 @@ assert_frame_equal, assert_series_equal, assert_almost_equal, + assert_produces_warning, ensure_clean, assertRaisesRegexp, makeCustomDataframe as mkdf, @@ -667,24 +668,42 @@ def test_ix_align(self): def test_ix_frame_align(self): from pandas import DataFrame - df = DataFrame(np.random.randn(2, 10)) - df.sort_index(inplace=True) - p_orig = Panel(np.random.randn(3, 10, 2)) + p_orig = tm.makePanel() + df = p_orig.ix[0].copy() + assert_frame_equal(p_orig['ItemA'],df) p = p_orig.copy() p.ix[0, :, :] = df - out = p.ix[0, :, :].T.reindex(df.index, columns=df.columns) - assert_frame_equal(out, df) + assert_panel_equal(p, p_orig) p = p_orig.copy() p.ix[0] = df - out = p.ix[0].T.reindex(df.index, columns=df.columns) - assert_frame_equal(out, df) + assert_panel_equal(p, p_orig) + + p = p_orig.copy() + p.iloc[0, :, :] = df + assert_panel_equal(p, p_orig) + + p = p_orig.copy() + p.iloc[0] = df + assert_panel_equal(p, p_orig) + + p = p_orig.copy() + p.loc['ItemA'] = df + assert_panel_equal(p, p_orig) + + p = p_orig.copy() + p.loc['ItemA', :, :] = df + assert_panel_equal(p, p_orig) + + p = p_orig.copy() + p['ItemA'] = df + assert_panel_equal(p, p_orig) p = p_orig.copy() p.ix[0, [0, 1, 3, 5], -2:] = df out = p.ix[0, [0, 1, 3, 5], -2:] - assert_frame_equal(out, df.T.reindex([0, 1, 3, 5], p.minor_axis[-2:])) + assert_frame_equal(out, df.iloc[[0,1,3,5],[2,3]]) # GH3830, panel assignent by values/frame for dtype in ['float64','int64']: @@ -2324,6 +2343,13 @@ def test_panel_index(): np.repeat([1, 2, 3], 4)]) assert(index.equals(expected)) + +def test_import_warnings(): + # GH8152 + panel = Panel(np.random.rand(3, 3, 3)) + with assert_produces_warning(): + panel.major_xs(1, copy=False) + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index 42427617991af..3cc2d94789a8d 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -149,6 +149,11 @@ def test_multiindex(self): class TestGetDummies(tm.TestCase): + + def setUp(self): + self.df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'], + 'C': [1, 2, 3]}) + def test_basic(self): s_list = list('abc') s_series = Series(s_list) @@ -209,6 +214,114 @@ def test_unicode(self): # See GH 6885 - get_dummies chokes on unicode values u('letter_%s') % eacute: {0: 0.0, 1: 1.0, 2: 1.0}}) assert_frame_equal(res, exp) + def test_dataframe_dummies_all_obj(self): + df = self.df[['A', 'B']] + result = get_dummies(df) + 
expected = DataFrame({'A_a': [1., 0, 1], 'A_b': [0., 1, 0], + 'B_b': [1., 1, 0], 'B_c': [0., 0, 1]}) + assert_frame_equal(result, expected) + + def test_dataframe_dummies_mix_default(self): + df = self.df + result = get_dummies(df) + expected = DataFrame({'C': [1, 2, 3], 'A_a': [1., 0, 1], + 'A_b': [0., 1, 0], 'B_b': [1., 1, 0], + 'B_c': [0., 0, 1]}) + expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']] + assert_frame_equal(result, expected) + + def test_dataframe_dummies_prefix_list(self): + prefixes = ['from_A', 'from_B'] + df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'], + 'C': [1, 2, 3]}) + result = get_dummies(df, prefix=prefixes) + expected = DataFrame({'C': [1, 2, 3], 'from_A_a': [1., 0, 1], + 'from_A_b': [0., 1, 0], 'from_B_b': [1., 1, 0], + 'from_B_c': [0., 0, 1]}) + expected = expected[['C', 'from_A_a', 'from_A_b', 'from_B_b', + 'from_B_c']] + assert_frame_equal(result, expected) + + def test_datafrmae_dummies_prefix_str(self): + # not that you should do this... + df = self.df + result = get_dummies(df, prefix='bad') + expected = DataFrame([[1, 1., 0., 1., 0.], + [2, 0., 1., 1., 0.], + [3, 1., 0., 0., 1.]], + columns=['C', 'bad_a', 'bad_b', 'bad_b', 'bad_c']) + assert_frame_equal(result, expected) + + def test_dataframe_dummies_subset(self): + df = self.df + result = get_dummies(df, prefix=['from_A'], + columns=['A']) + expected = DataFrame({'from_A_a': [1., 0, 1], 'from_A_b': [0., 1, 0], + 'B': ['b', 'b', 'c'], 'C': [1, 2, 3]}) + assert_frame_equal(result, expected) + + def test_dataframe_dummies_prefix_sep(self): + df = self.df + result = get_dummies(df, prefix_sep='..') + expected = DataFrame({'C': [1, 2, 3], 'A..a': [1., 0, 1], + 'A..b': [0., 1, 0], 'B..b': [1., 1, 0], + 'B..c': [0., 0, 1]}) + expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']] + assert_frame_equal(result, expected) + + result = get_dummies(df, prefix_sep=['..', '__']) + expected = expected.rename(columns={'B..b': 'B__b', 'B..c': 'B__c'}) + assert_frame_equal(result, expected) + + result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'}) + assert_frame_equal(result, expected) + + def test_dataframe_dummies_prefix_bad_length(self): + with tm.assertRaises(ValueError): + get_dummies(self.df, prefix=['too few']) + + def test_dataframe_dummies_prefix_sep_bad_length(self): + with tm.assertRaises(ValueError): + get_dummies(self.df, prefix_sep=['bad']) + + def test_dataframe_dummies_prefix_dict(self): + prefixes = {'A': 'from_A', 'B': 'from_B'} + df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'], + 'C': [1, 2, 3]}) + result = get_dummies(df, prefix=prefixes) + expected = DataFrame({'from_A_a': [1., 0, 1], 'from_A_b': [0., 1, 0], + 'from_B_b': [1., 1, 0], 'from_B_c': [0., 0, 1], + 'C': [1, 2, 3]}) + assert_frame_equal(result, expected) + + def test_dataframe_dummies_with_na(self): + df = self.df + df.loc[3, :] = [np.nan, np.nan, np.nan] + result = get_dummies(df, dummy_na=True) + expected = DataFrame({'C': [1, 2, 3, np.nan], 'A_a': [1., 0, 1, 0], + 'A_b': [0., 1, 0, 0], 'A_nan': [0., 0, 0, 1], 'B_b': [1., 1, 0, 0], + 'B_c': [0., 0, 1, 0], 'B_nan': [0., 0, 0, 1]}) + expected = expected[['C', 'A_a', 'A_b', 'A_nan', 'B_b', 'B_c', + 'B_nan']] + assert_frame_equal(result, expected) + + result = get_dummies(df, dummy_na=False) + expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']] + assert_frame_equal(result, expected) + + def test_dataframe_dummies_with_categorical(self): + df = self.df + df['cat'] = pd.Categorical(['x', 'y', 'y']) + result = get_dummies(df) + expected = DataFrame({'C': 
[1, 2, 3], 'A_a': [1., 0, 1], + 'A_b': [0., 1, 0], 'B_b': [1., 1, 0], + 'B_c': [0., 0, 1], 'cat_x': [1., 0, 0], + 'cat_y': [0., 1, 1]}) + expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c', + 'cat_x', 'cat_y']] + assert_frame_equal(result, expected) + + class TestConvertDummies(tm.TestCase): def test_convert_dummies(self): df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', @@ -218,8 +331,9 @@ def test_convert_dummies(self): 'C': np.random.randn(8), 'D': np.random.randn(8)}) - result = convert_dummies(df, ['A', 'B']) - result2 = convert_dummies(df, ['A', 'B'], prefix_sep='.') + with tm.assert_produces_warning(FutureWarning): + result = convert_dummies(df, ['A', 'B']) + result2 = convert_dummies(df, ['A', 'B'], prefix_sep='.') expected = DataFrame({'A_foo': [1, 0, 1, 0, 1, 0, 1, 1], 'A_bar': [0, 1, 0, 1, 0, 1, 0, 0], diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 022a8b543ce32..2d3961a643991 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -16,10 +16,12 @@ import pandas as pd from pandas import (Index, Series, DataFrame, isnull, notnull, bdate_range, - date_range, period_range) + date_range, period_range, timedelta_range) from pandas.core.index import MultiIndex from pandas.core.indexing import IndexingError +from pandas.tseries.period import PeriodIndex from pandas.tseries.index import Timestamp, DatetimeIndex +from pandas.tseries.tdi import Timedelta, TimedeltaIndex import pandas.core.common as com import pandas.core.config as cf import pandas.lib as lib @@ -76,18 +78,31 @@ def test_dt_namespace_accessor(self): # GH 7207 # test .dt namespace accessor - ok_for_base = ['year','month','day','hour','minute','second','weekofyear','week','dayofweek','weekday','dayofyear','quarter'] + ok_for_base = ['year','month','day','hour','minute','second','weekofyear','week','dayofweek','weekday','dayofyear','quarter','freq'] ok_for_period = ok_for_base + ['qyear'] ok_for_dt = ok_for_base + ['date','time','microsecond','nanosecond', 'is_month_start', 'is_month_end', 'is_quarter_start', - 'is_quarter_end', 'is_year_start', 'is_year_end'] + 'is_quarter_end', 'is_year_start', 'is_year_end', 'tz'] + ok_for_dt_methods = ['to_period','to_pydatetime','tz_localize','tz_convert'] + ok_for_td = ['days','hours','minutes','seconds','milliseconds','microseconds','nanoseconds'] + ok_for_td_methods = ['components','to_pytimedelta'] def get_expected(s, name): result = getattr(Index(s.values),prop) if isinstance(result, np.ndarray): if com.is_integer_dtype(result): result = result.astype('int64') + elif not com.is_list_like(result): + return result return Series(result,index=s.index) + def compare(s, name): + a = getattr(s.dt,prop) + b = get_expected(s,prop) + if not (com.is_list_like(a) and com.is_list_like(b)): + self.assertEqual(a,b) + else: + tm.assert_series_equal(a,b) + # invalids for s in [Series(np.arange(5)), Series(list('abcde')), @@ -98,9 +113,56 @@ def get_expected(s, name): for s in [Series(date_range('20130101',periods=5)), Series(date_range('20130101',periods=5,freq='s')), Series(date_range('20130101 00:00:00',periods=5,freq='ms'))]: - for prop in ok_for_dt: - tm.assert_series_equal(getattr(s.dt,prop),get_expected(s,prop)) + + # we test freq below + if prop != 'freq': + compare(s, prop) + + for prop in ok_for_dt_methods: + getattr(s.dt,prop) + + result = s.dt.to_pydatetime() + self.assertIsInstance(result,np.ndarray) + self.assertTrue(result.dtype == object) + + result = s.dt.tz_localize('US/Eastern') + expected = 
Series(DatetimeIndex(s.values).tz_localize('US/Eastern'),index=s.index) + tm.assert_series_equal(result, expected) + + tz_result = result.dt.tz + self.assertEqual(str(tz_result), 'US/Eastern') + freq_result = s.dt.freq + self.assertEqual(freq_result, DatetimeIndex(s.values, freq='infer').freq) + + # let's localize, then convert + result = s.dt.tz_localize('UTC').dt.tz_convert('US/Eastern') + expected = Series(DatetimeIndex(s.values).tz_localize('UTC').tz_convert('US/Eastern'),index=s.index) + tm.assert_series_equal(result, expected) + + # timedeltaindex + for s in [Series(timedelta_range('1 day',periods=5)), + Series(timedelta_range('1 day 01:23:45',periods=5,freq='s')), + Series(timedelta_range('2 days 01:23:45.012345',periods=5,freq='ms'))]: + for prop in ok_for_td: + + # we test freq below + if prop != 'freq': + compare(s, prop) + + for prop in ok_for_td_methods: + getattr(s.dt,prop) + + result = s.dt.components + self.assertIsInstance(result,DataFrame) + tm.assert_index_equal(result.index,s.index) + + result = s.dt.to_pytimedelta() + self.assertIsInstance(result,np.ndarray) + self.assertTrue(result.dtype == object) + + freq_result = s.dt.freq + self.assertEqual(freq_result, TimedeltaIndex(s.values, freq='infer').freq) # both index = date_range('20130101',periods=3,freq='D') @@ -110,10 +172,16 @@ def get_expected(s, name): tm.assert_series_equal(s.dt.second,Series(np.array([0,1,2],dtype='int64'),index=index)) # periodindex - for s in [Series(period_range('20130101',periods=5,freq='D').asobject)]: + for s in [Series(period_range('20130101',periods=5,freq='D'))]: for prop in ok_for_period: - tm.assert_series_equal(getattr(s.dt,prop),get_expected(s,prop)) + + # we test freq below + if prop != 'freq': + compare(s, prop) + + freq_result = s.dt.freq + self.assertEqual(freq_result, PeriodIndex(s.values).freq) # test limited display api def get_dir(s): @@ -122,7 +190,7 @@ def get_dir(s): s = Series(date_range('20130101',periods=5,freq='D')) results = get_dir(s) - tm.assert_almost_equal(results,list(sorted(set(ok_for_dt)))) + tm.assert_almost_equal(results,list(sorted(set(ok_for_dt + ok_for_dt_methods)))) s = Series(period_range('20130101',periods=5,freq='D').asobject) results = get_dir(s) @@ -259,8 +327,7 @@ def test_getitem_setitem_ellipsis(self): self.assertTrue((result == 5).all()) def test_getitem_negative_out_of_bounds(self): - s = Series([tm.rands(5) for _ in range(10)], - index=[tm.rands(10) for _ in range(10)]) + s = Series(tm.rands_array(5, 10), index=tm.rands_array(10, 10)) self.assertRaises(IndexError, s.__getitem__, -11) self.assertRaises(IndexError, s.__setitem__, -11, 'foo') @@ -747,6 +814,24 @@ def test_constructor_dtype_datetime64(self): s = Series([pd.NaT, np.nan, '2013-08-05 15:30:00.000001']) self.assertEqual(s.dtype,'datetime64[ns]') + # tz-aware (UTC and other tz's) + # GH 8411 + dr = date_range('20130101',periods=3) + self.assertTrue(Series(dr).iloc[0].tz is None) + dr = date_range('20130101',periods=3,tz='UTC') + self.assertTrue(str(Series(dr).iloc[0].tz) == 'UTC') + dr = date_range('20130101',periods=3,tz='US/Eastern') + self.assertTrue(str(Series(dr).iloc[0].tz) == 'US/Eastern') + + def test_constructor_periodindex(self): + # GH7932 + # converting a PeriodIndex when put in a Series + + pi = period_range('20130101',periods=5,freq='D') + s = Series(pi) + expected = Series(pi.asobject) + assert_series_equal(s, expected) + def test_constructor_dict(self): d = {'a': 0., 'b': 1., 'c': 2.} result = Series(d, index=['b', 'c', 'd', 'a']) @@ -1137,6 +1222,12 @@ def 
test_getitem_dups(self): result = s['C'] assert_series_equal(result, expected) + def test_getitem_dataframe(self): + rng = list(range(10)) + s = pd.Series(10, index=rng) + df = pd.DataFrame(rng, index=rng) + self.assertRaises(TypeError, s.__getitem__, df>5) + def test_setitem_ambiguous_keyerror(self): s = Series(lrange(10), index=lrange(0, 20, 2)) @@ -1725,6 +1816,13 @@ def test_drop(self): # bad axis self.assertRaises(ValueError, s.drop, 'one', axis='columns') + # GH 8522 + s = Series([2,3], index=[True, False]) + self.assertTrue(s.index.is_object()) + result = s.drop(True) + expected = Series([3],index=[False]) + assert_series_equal(result,expected) + def test_ix_setitem(self): inds = self.series.index[[3, 4, 7]] @@ -2207,7 +2305,7 @@ def testit(): self.series[5:15] = np.NaN # idxmax, idxmin, min, and max are valid for dates - if not ('max' in name or 'min' in name): + if name not in ['max','min']: ds = Series(date_range('1/1/2001', periods=10)) self.assertRaises(TypeError, f, ds) @@ -2299,6 +2397,10 @@ def test_quantile(self): q = tds.quantile(.25) self.assertEqual(q, pd.to_timedelta('24:00:00')) + # GH7661 + result = Series([np.timedelta64('NaT')]).sum() + self.assertTrue(result is pd.NaT) + def test_quantile_multi(self): from numpy import percentile @@ -2811,6 +2913,16 @@ def test_timedelta64_conversions(self): expected = s1.apply(lambda x: np.timedelta64(m,unit) / x) result = np.timedelta64(m,unit) / s1 + # astype + s = Series(date_range('20130101',periods=3)) + result = s.astype(object) + self.assertIsInstance(result.iloc[0],datetime) + self.assertTrue(result.dtype == np.object_) + + result = s1.astype(object) + self.assertIsInstance(result.iloc[0],timedelta) + self.assertTrue(result.dtype == np.object_) + def test_timedelta64_equal_timedelta_supported_ops(self): ser = Series([Timestamp('20130301'), Timestamp('20130228 23:00:00'), Timestamp('20130228 22:00:00'), @@ -2837,6 +2949,22 @@ def timedelta64(*args): raise AssertionError( "invalid comparsion [op->{0},d->{1},h->{2},m->{3},s->{4},us->{5}]\n{6}\n{7}\n".format(op, d, h, m, s, us, lhs, rhs)) + def test_timedelta_assignment(self): + # GH 8209 + s = Series([]) + s.loc['B'] = timedelta(1) + tm.assert_series_equal(s,Series(Timedelta('1 days'),index=['B'])) + + s = s.reindex(s.index.insert(0, 'A')) + tm.assert_series_equal(s,Series([np.nan,Timedelta('1 days')],index=['A','B'])) + + result = s.fillna(timedelta(1)) + expected = Series(Timedelta('1 days'),index=['A','B']) + tm.assert_series_equal(result, expected) + + s.loc['A'] = timedelta(1) + tm.assert_series_equal(s, expected) + def test_operators_datetimelike(self): def run_ops(ops, get_ser, test_ser): @@ -2946,12 +3074,37 @@ def test_timedelta64_functions(self): # max/min result = td.max() - expected = Series([timedelta(2)], dtype='timedelta64[ns]') - assert_series_equal(result, expected) + expected = Timedelta('2 days') + self.assertEqual(result, expected) result = td.min() - expected = Series([timedelta(1)], dtype='timedelta64[ns]') - assert_series_equal(result, expected) + expected = Timedelta('1 days') + self.assertEqual(result, expected) + + def test_ops_consistency_on_empty(self): + + # GH 7869 + # consistency on empty + + # float + result = Series(dtype=float).sum() + self.assertEqual(result,0) + + result = Series(dtype=float).mean() + self.assertTrue(isnull(result)) + + result = Series(dtype=float).median() + self.assertTrue(isnull(result)) + + # timedelta64[ns] + result = Series(dtype='m8[ns]').sum() + self.assertEqual(result, Timedelta(0)) + + result = 
Series(dtype='m8[ns]').mean() + self.assertTrue(result is pd.NaT) + + result = Series(dtype='m8[ns]').median() + self.assertTrue(result is pd.NaT) def test_timedelta_fillna(self): #GH 3371 @@ -3159,15 +3312,12 @@ def test_bfill(self): assert_series_equal(ts.bfill(), ts.fillna(method='bfill')) def test_sub_of_datetime_from_TimeSeries(self): - from pandas.tseries.timedeltas import _possibly_cast_to_timedelta + from pandas.tseries.timedeltas import to_timedelta from datetime import datetime a = Timestamp(datetime(1993, 0o1, 0o7, 13, 30, 00)) b = datetime(1993, 6, 22, 13, 30) a = Series([a]) - result = _possibly_cast_to_timedelta(np.abs(a - b)) - self.assertEqual(result.dtype, 'timedelta64[ns]') - - result = _possibly_cast_to_timedelta(np.abs(b - a)) + result = to_timedelta(np.abs(a - b)) self.assertEqual(result.dtype, 'timedelta64[ns]') def test_datetime64_with_index(self): @@ -3203,19 +3353,19 @@ def test_timedelta64_nan(self): td1 = td.copy() td1[0] = np.nan self.assertTrue(isnull(td1[0])) - self.assertEqual(td1[0].view('i8'), tslib.iNaT) + self.assertEqual(td1[0].value, tslib.iNaT) td1[0] = td[0] self.assertFalse(isnull(td1[0])) td1[1] = tslib.iNaT self.assertTrue(isnull(td1[1])) - self.assertEqual(td1[1].view('i8'), tslib.iNaT) + self.assertEqual(td1[1].value, tslib.iNaT) td1[1] = td[1] self.assertFalse(isnull(td1[1])) td1[2] = tslib.NaT self.assertTrue(isnull(td1[2])) - self.assertEqual(td1[2].view('i8'), tslib.iNaT) + self.assertEqual(td1[2].value, tslib.iNaT) td1[2] = td[2] self.assertFalse(isnull(td1[2])) @@ -3701,11 +3851,10 @@ def _check_op(arr, op): _check_op(arr, operator.floordiv) def test_series_frame_radd_bug(self): - from pandas.util.testing import rands import operator # GH 353 - vals = Series([rands(5) for _ in range(10)]) + vals = Series(tm.rands_array(5, 10)) result = 'foo_' + vals expected = vals.map(lambda x: 'foo_' + x) assert_series_equal(result, expected) @@ -4371,6 +4520,14 @@ def test_to_csv_list_entries(self): buf = StringIO() split.to_csv(buf) + def test_to_csv_path_is_none(self): + # GH 8215 + # Series.to_csv() was returning None, inconsistent with + # DataFrame.to_csv() which returned string + s = Series([1, 2, 3]) + csv_str = s.to_csv(path=None) + self.assertIsInstance(csv_str, str) + def test_clip(self): val = self.ts.median() @@ -4467,6 +4624,16 @@ def test_shift(self): shifted5 = ps.shift(1, offset=datetools.bday) assert_series_equal(shifted5, shifted4) + # 32-bit taking + # GH 8129 + index=date_range('2000-01-01',periods=5) + for dtype in ['int32','int64']: + s1 = Series(np.arange(5,dtype=dtype),index=index) + p = s1.iloc[1] + result = s1.shift(periods=p) + expected = Series([np.nan,0,1,2,3],index=index) + assert_series_equal(result,expected) + def test_tshift(self): # PeriodIndex ps = tm.makePeriodSeries() @@ -4872,6 +5039,27 @@ def test_cast_on_putmask(self): assert_series_equal(s, expected) + def test_type_promote_putmask(self): + + # GH8387: test that changing types does not break alignment + ts = Series(np.random.randn(100), index=np.arange(100,0,-1)).round(5) + left, mask = ts.copy(), ts > 0 + right = ts[mask].copy().map(str) + left[mask] = right + assert_series_equal(left, ts.map(lambda t: str(t) if t > 0 else t)) + + s = Series([0, 1, 2, 0 ]) + mask = s > 0 + s2 = s[ mask ].map( str ) + s[mask] = s2 + assert_series_equal(s, Series([0, '1', '2', 0])) + + s = Series([0, 'foo', 'bar', 0 ]) + mask = Series([False, True, True, False]) + s2 = s[ mask ] + s[mask] = s2 + assert_series_equal(s, Series([0, 'foo','bar', 0])) + def 
test_astype_cast_nan_int(self): df = Series([1.0, 2.0, 3.0, np.nan]) self.assertRaises(ValueError, df.astype, np.int64) @@ -5182,8 +5370,8 @@ def _check_align(a, b, how='left', fill=None): join_index = a.index.join(b.index, how=how) if fill is not None: - diff_a = aa.index.diff(join_index) - diff_b = ab.index.diff(join_index) + diff_a = aa.index.difference(join_index) + diff_b = ab.index.difference(join_index) if len(diff_a) > 0: self.assertTrue((aa.reindex(diff_a) == fill).all()) if len(diff_b) > 0: @@ -5779,7 +5967,6 @@ def test_replace_with_single_list(self): s.replace([1,2,3],inplace=True,method='crash_cymbal') assert_series_equal(s, ser) - def test_replace_mixed_types(self): s = Series(np.arange(5),dtype='int64') @@ -5987,6 +6174,43 @@ def test_concat_empty_series_dtypes(self): self.assertEqual(pd.concat([Series(dtype=np.bool_), Series(dtype=np.int32)]).dtype, np.int32) + def test_searchsorted_numeric_dtypes_scalar(self): + s = Series([1, 2, 90, 1000, 3e9]) + r = s.searchsorted(30) + e = 2 + tm.assert_equal(r, e) + + r = s.searchsorted([30]) + e = np.array([2]) + tm.assert_array_equal(r, e) + + def test_searchsorted_numeric_dtypes_vector(self): + s = Series([1, 2, 90, 1000, 3e9]) + r = s.searchsorted([91, 2e6]) + e = np.array([3, 4]) + tm.assert_array_equal(r, e) + + def test_search_sorted_datetime64_scalar(self): + s = Series(pd.date_range('20120101', periods=10, freq='2D')) + v = pd.Timestamp('20120102') + r = s.searchsorted(v) + e = 1 + tm.assert_equal(r, e) + + def test_search_sorted_datetime64_list(self): + s = Series(pd.date_range('20120101', periods=10, freq='2D')) + v = [pd.Timestamp('20120102'), pd.Timestamp('20120104')] + r = s.searchsorted(v) + e = np.array([1, 2]) + tm.assert_array_equal(r, e) + + def test_searchsorted_sorter(self): + # GH8490 + s = Series([3, 1, 2]) + r = s.searchsorted([0, 3], sorter=np.argsort(s)) + e = np.array([0, 2]) + tm.assert_array_equal(r, e) + class TestSeriesNonUnique(tm.TestCase): @@ -6144,4 +6368,3 @@ def test_unique_data_ownership(self): if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) - diff --git a/pandas/tests/test_util.py b/pandas/tests/test_util.py new file mode 100644 index 0000000000000..8c56ba0e0f548 --- /dev/null +++ b/pandas/tests/test_util.py @@ -0,0 +1,80 @@ + +import warnings + +import nose + +import pandas.util +from pandas.util.decorators import deprecate_kwarg +import pandas.util.testing as tm + +class TestDecorators(tm.TestCase): + def setUp(self): + @deprecate_kwarg('old', 'new') + def _f1(new=False): + return new + + @deprecate_kwarg('old', 'new', {'yes': True, 'no': False}) + def _f2(new=False): + return new + + @deprecate_kwarg('old', 'new', lambda x: x+1) + def _f3(new=0): + return new + + self.f1 = _f1 + self.f2 = _f2 + self.f3 = _f3 + + def test_deprecate_kwarg(self): + x = 78 + with tm.assert_produces_warning(FutureWarning): + result = self.f1(old=x) + self.assertIs(result, x) + with tm.assert_produces_warning(None): + self.f1(new=x) + + def test_dict_deprecate_kwarg(self): + x = 'yes' + with tm.assert_produces_warning(FutureWarning): + result = self.f2(old=x) + self.assertEqual(result, True) + + def test_missing_deprecate_kwarg(self): + x = 'bogus' + with tm.assert_produces_warning(FutureWarning): + result = self.f2(old=x) + self.assertEqual(result, 'bogus') + + def test_callable_deprecate_kwarg(self): + x = 5 + with tm.assert_produces_warning(FutureWarning): + result = self.f3(old=x) + self.assertEqual(result, x+1) + with tm.assertRaises(TypeError): + 
self.f3(old='hello') + + def test_bad_deprecate_kwarg(self): + with tm.assertRaises(TypeError): + @deprecate_kwarg('old', 'new', 0) + def f4(new=None): + pass + + +def test_rands(): + r = tm.rands(10) + assert(len(r) == 10) + + +def test_rands_array(): + arr = tm.rands_array(5, size=10) + assert(arr.shape == (10,)) + assert(len(arr[0]) == 5) + + arr = tm.rands_array(7, size=(10, 10)) + assert(arr.shape == (10, 10)) + assert(len(arr[1, 1]) == 7) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tools/describe.py b/pandas/tools/describe.py deleted file mode 100644 index eca5a800b3c6c..0000000000000 --- a/pandas/tools/describe.py +++ /dev/null @@ -1,17 +0,0 @@ -from pandas.core.series import Series - - -def value_range(df): - """ - Return the minimum and maximum of a dataframe in a series object - - Parameters - ---------- - df : DataFrame - - Returns - ------- - (maximum, minimum) : Series - - """ - return Series((min(df.min()), max(df.max())), ('Minimum', 'Maximum')) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 3a5c191148fe6..8fddfdda797c6 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -41,7 +41,7 @@ def merge(left, right, how='inner', on=None, left_on=None, right_on=None, merge.__doc__ = _merge_doc % '\nleft : DataFrame' -class MergeError(Exception): +class MergeError(ValueError): pass @@ -238,7 +238,9 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): key_col.put(na_indexer, com.take_1d(self.left_join_keys[i], left_na_indexer)) - elif left_indexer is not None: + elif left_indexer is not None \ + and isinstance(self.left_join_keys[i], np.ndarray): + if name is None: name = 'key_%d' % i @@ -566,9 +568,6 @@ def _get_single_indexer(join_key, index, sort=False): def _left_join_on_index(left_ax, right_ax, join_keys, sort=False): - join_index = left_ax - left_indexer = None - if len(join_keys) > 1: if not ((isinstance(right_ax, MultiIndex) and len(join_keys) == right_ax.nlevels)): @@ -577,22 +576,21 @@ def _left_join_on_index(left_ax, right_ax, join_keys, sort=False): "number of join keys must be the number of " "levels in right_ax") - left_tmp, right_indexer = \ - _get_multiindex_indexer(join_keys, right_ax, - sort=sort) - if sort: - left_indexer = left_tmp - join_index = left_ax.take(left_indexer) + left_indexer, right_indexer = \ + _get_multiindex_indexer(join_keys, right_ax, sort=sort) else: jkey = join_keys[0] - if sort: - left_indexer, right_indexer = \ - _get_single_indexer(jkey, right_ax, sort=sort) - join_index = left_ax.take(left_indexer) - else: - right_indexer = right_ax.get_indexer(jkey) - return join_index, left_indexer, right_indexer + left_indexer, right_indexer = \ + _get_single_indexer(jkey, right_ax, sort=sort) + + if sort or len(left_ax) != len(left_indexer): + # if asked to sort or there are 1-to-many matches + join_index = left_ax.take(left_indexer) + return join_index, left_indexer, right_indexer + else: + # left frame preserves order & length of its index + return left_ax, None, right_indexer def _right_outer_join(x, y, max_groups): @@ -668,7 +666,7 @@ def _sort_labels(uniques, left, right): def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, - keys=None, levels=None, names=None, verify_integrity=False): + keys=None, levels=None, names=None, verify_integrity=False, copy=True): """ Concatenate pandas objects along a particular axis with optional set logic along the other axes. 
Can also add a layer of hierarchical indexing on the @@ -681,7 +679,7 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, If a dict is passed, the sorted keys will be used as the `keys` argument, unless it is passed, in which case the values will be selected (see below). Any None objects will be dropped silently unless - they are all None in which case an Exception will be raised + they are all None in which case a ValueError will be raised axis : {0, 1, ...}, default 0 The axis to concatenate along join : {'inner', 'outer'}, default 'outer' @@ -706,6 +704,8 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, concatenating objects where the concatenation axis does not have meaningful indexing information. Note the the index values on the other axes are still respected in the join. + copy : boolean, default True + If False, do not copy data unnecessarily Notes ----- @@ -718,7 +718,8 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, op = _Concatenator(objs, axis=axis, join_axes=join_axes, ignore_index=ignore_index, join=join, keys=keys, levels=levels, names=names, - verify_integrity=verify_integrity) + verify_integrity=verify_integrity, + copy=copy) return op.get_result() @@ -729,7 +730,7 @@ class _Concatenator(object): def __init__(self, objs, axis=0, join='outer', join_axes=None, keys=None, levels=None, names=None, - ignore_index=False, verify_integrity=False): + ignore_index=False, verify_integrity=False, copy=True): if not isinstance(objs, (list,tuple,types.GeneratorType,dict,TextFileReader)): raise TypeError('first argument must be a list-like of pandas ' 'objects, you passed an object of type ' @@ -763,7 +764,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, keys = clean_keys if len(objs) == 0: - raise Exception('All objects passed were None') + raise ValueError('All objects passed were None') # consolidate data & figure out what our result ndim is going to be ndims = set() @@ -848,6 +849,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, self.ignore_index = ignore_index self.verify_integrity = verify_integrity + self.copy = copy self.new_axes = self._get_new_axes() @@ -881,7 +883,9 @@ def get_result(self): mgrs_indexers.append((obj._data, indexers)) new_data = concatenate_block_managers( - mgrs_indexers, self.new_axes, concat_axis=self.axis, copy=True) + mgrs_indexers, self.new_axes, concat_axis=self.axis, copy=self.copy) + if not self.copy: + new_data._consolidate_inplace() return self.objs[0]._from_axes(new_data, self.new_axes).__finalize__(self, method='concat') @@ -993,7 +997,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): names = [None] * len(zipped) if levels is None: - levels = [Categorical.from_array(zp).levels for zp in zipped] + levels = [Categorical.from_array(zp).categories for zp in zipped] else: levels = [_ensure_index(x) for x in levels] else: @@ -1032,7 +1036,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): label_list.extend(concat_index.labels) else: factor = Categorical.from_array(concat_index) - levels.append(factor.levels) + levels.append(factor.categories) label_list.append(factor.codes) if len(names) == len(levels): diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index 83df908d8033f..ef477582b82f2 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -116,7 +116,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', table = agged if 
table.index.nlevels > 1: - to_unstack = [agged.index.names[i] + to_unstack = [agged.index.names[i] or i for i in range(len(index), len(keys))] table = agged.unstack(to_unstack) @@ -207,6 +207,11 @@ def _compute_grand_margin(data, values, aggfunc): try: if isinstance(aggfunc, compat.string_types): grand_margin[k] = getattr(v, aggfunc)() + elif isinstance(aggfunc, dict): + if isinstance(aggfunc[k], compat.string_types): + grand_margin[k] = getattr(v, aggfunc[k])() + else: + grand_margin[k] = aggfunc[k](v) else: grand_margin[k] = aggfunc(v) except TypeError: diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 954a88bc9d1ad..0e477d8eedb98 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -3,6 +3,7 @@ import datetime import warnings import re +from math import ceil from collections import namedtuple from contextlib import contextmanager from distutils.version import LooseVersion @@ -116,7 +117,10 @@ def _get_standard_colors(num_colors=None, colormap=None, color_type='default', colors = color else: if color_type == 'default': - colors = plt.rcParams.get('axes.color_cycle', list('bgrcmyk')) + # need to call list() on the result to copy so we don't + # modify the global rcParams below + colors = list(plt.rcParams.get('axes.color_cycle', + list('bgrcmyk'))) if isinstance(colors, compat.string_types): colors = list(colors) elif color_type == 'random': @@ -246,7 +250,8 @@ def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, df = frame._get_numeric_data() n = df.columns.size - fig, axes = _subplots(nrows=n, ncols=n, figsize=figsize, ax=ax, + naxes = n * n + fig, axes = _subplots(naxes=naxes, figsize=figsize, ax=ax, squeeze=False) # no gaps between subplots @@ -401,10 +406,10 @@ def normalize(series): for kls in classes: to_plot[kls] = [[], []] - n = len(frame.columns) - 1 + m = len(frame.columns) - 1 s = np.array([(np.cos(t), np.sin(t)) - for t in [2.0 * np.pi * (i / float(n)) - for i in range(n)]]) + for t in [2.0 * np.pi * (i / float(m)) + for i in range(m)]]) for i in range(n): row = df.iloc[i].values @@ -752,6 +757,7 @@ class MPLPlot(object): data : """ + _layout_type = 'vertical' _default_rot = 0 orientation = None @@ -767,7 +773,7 @@ def __init__(self, data, kind=None, by=None, subplots=False, sharex=True, xticks=None, yticks=None, sort_columns=False, fontsize=None, secondary_y=False, colormap=None, - table=False, **kwds): + table=False, layout=None, **kwds): self.data = data self.by = by @@ -780,6 +786,7 @@ def __init__(self, data, kind=None, by=None, subplots=False, sharex=True, self.sharex = sharex self.sharey = sharey self.figsize = figsize + self.layout = layout self.xticks = xticks self.yticks = yticks @@ -792,7 +799,11 @@ def __init__(self, data, kind=None, by=None, subplots=False, sharex=True, if rot is not None: self.rot = rot + # need to know for format_date_labels since it's rotated to 30 by + # default + self._rot_set = True else: + self._rot_set = False if isinstance(self._default_rot, dict): self.rot = self._default_rot[self.kind] else: @@ -841,16 +852,13 @@ def __init__(self, data, kind=None, by=None, subplots=False, sharex=True, self._validate_color_args() def _validate_color_args(self): - from pandas import DataFrame if 'color' not in self.kwds and 'colors' in self.kwds: warnings.warn(("'colors' is being deprecated. 
Please use 'color'" "instead of 'colors'")) colors = self.kwds.pop('colors') self.kwds['color'] = colors - if ('color' in self.kwds and - (isinstance(self.data, Series) or - isinstance(self.data, DataFrame) and len(self.data.columns) == 1)): + if ('color' in self.kwds and self.nseries == 1): # support series.plot(color='green') self.kwds['color'] = [self.kwds['color']] @@ -867,9 +875,11 @@ def _validate_color_args(self): " use one or the other or pass 'style' " "without a color symbol") - def _iter_data(self, data=None, keep_index=False): + def _iter_data(self, data=None, keep_index=False, fillna=None): if data is None: data = self.data + if fillna is not None: + data = data.fillna(fillna) from pandas.core.frame import DataFrame if isinstance(data, (Series, np.ndarray, Index)): @@ -932,22 +942,22 @@ def _maybe_right_yaxis(self, ax): def _setup_subplots(self): if self.subplots: - nrows, ncols = self._get_layout() - fig, axes = _subplots(nrows=nrows, ncols=ncols, + fig, axes = _subplots(naxes=self.nseries, sharex=self.sharex, sharey=self.sharey, - figsize=self.figsize, ax=self.ax) - if not com.is_list_like(axes): - axes = np.array([axes]) + figsize=self.figsize, ax=self.ax, + layout=self.layout, + layout_type=self._layout_type) else: if self.ax is None: fig = self.plt.figure(figsize=self.figsize) - ax = fig.add_subplot(111) + axes = fig.add_subplot(111) else: fig = self.ax.get_figure() if self.figsize is not None: fig.set_size_inches(self.figsize) - ax = self.ax - axes = [ax] + axes = self.ax + + axes = _flatten(axes) if self.logx or self.loglog: [a.set_xscale('log') for a in axes] @@ -957,12 +967,18 @@ def _setup_subplots(self): self.fig = fig self.axes = axes - def _get_layout(self): - from pandas.core.frame import DataFrame - if isinstance(self.data, DataFrame): - return (len(self.data.columns), 1) + @property + def result(self): + """ + Return result axes + """ + if self.subplots: + if self.layout is not None and not com.is_list_like(self.ax): + return self.axes.reshape(*self.layout) + else: + return self.axes else: - return (1, 1) + return self.axes[0] def _compute_plot_data(self): numeric_data = self.data.convert_objects()._get_numeric_data() @@ -1245,12 +1261,8 @@ def _get_style(self, i, col_name): return style or None def _get_colors(self, num_colors=None, color_kwds='color'): - from pandas.core.frame import DataFrame if num_colors is None: - if isinstance(self.data, DataFrame): - num_colors = len(self.data.columns) - else: - num_colors = 1 + num_colors = self.nseries return _get_standard_colors(num_colors=num_colors, colormap=self.colormap, @@ -1360,40 +1372,69 @@ def _get_errorbars(self, label=None, index=None, xerr=True, yerr=True): class ScatterPlot(MPLPlot): - def __init__(self, data, x, y, **kwargs): + _layout_type = 'single' + + def __init__(self, data, x, y, c=None, **kwargs): MPLPlot.__init__(self, data, **kwargs) - self.kwds.setdefault('c', self.plt.rcParams['patch.facecolor']) if x is None or y is None: raise ValueError( 'scatter requires and x and y column') if com.is_integer(x) and not self.data.columns.holds_integer(): x = self.data.columns[x] if com.is_integer(y) and not self.data.columns.holds_integer(): y = self.data.columns[y] + if com.is_integer(c) and not self.data.columns.holds_integer(): + c = self.data.columns[c] self.x = x self.y = y + self.c = c - def _get_layout(self): - return (1, 1) + @property + def nseries(self): + return 1 def _make_plot(self): - x, y, data = self.x, self.y, self.data + import matplotlib as mpl + mpl_ge_1_3_1 = str(mpl.__version__) >= 
LooseVersion('1.3.1') + + import matplotlib.pyplot as plt + + x, y, c, data = self.x, self.y, self.c, self.data ax = self.axes[0] + # plot a colorbar only if a colormap is provided or necessary + cb = self.kwds.pop('colorbar', self.colormap or c in self.data.columns) + + # pandas uses colormap, matplotlib uses cmap. + cmap = self.colormap or 'Greys' + cmap = plt.cm.get_cmap(cmap) + + if c is None: + c_values = self.plt.rcParams['patch.facecolor'] + elif c in self.data.columns: + c_values = self.data[c].values + else: + c_values = c + if self.legend and hasattr(self, 'label'): label = self.label else: label = None - scatter = ax.scatter(data[x].values, data[y].values, label=label, - **self.kwds) + scatter = ax.scatter(data[x].values, data[y].values, c=c_values, + label=label, cmap=cmap, **self.kwds) + if cb: + img = ax.collections[0] + kws = dict(ax=ax) + if mpl_ge_1_3_1: + kws['label'] = c if c in self.data.columns else '' + self.fig.colorbar(img, **kws) self._add_legend_handle(scatter, label) errors_x = self._get_errorbars(label=x, index=0, yerr=False) - errors_y = self._get_errorbars(label=y, index=1, xerr=False) + errors_y = self._get_errorbars(label=y, index=0, xerr=False) if len(errors_x) > 0 or len(errors_y) > 0: err_kwds = dict(errors_x, **errors_y) - if 'color' in self.kwds: - err_kwds['color'] = self.kwds['color'] + err_kwds['ecolor'] = scatter.get_facecolor()[0] ax.errorbar(data[x].values, data[y].values, linestyle='none', **err_kwds) def _post_plot_logic(self): @@ -1404,6 +1445,8 @@ def _post_plot_logic(self): class HexBinPlot(MPLPlot): + _layout_type = 'single' + def __init__(self, data, x, y, C=None, **kwargs): MPLPlot.__init__(self, data, **kwargs) @@ -1421,8 +1464,9 @@ def __init__(self, data, x, y, C=None, **kwargs): self.y = y self.C = C - def _get_layout(self): - return (1, 1) + @property + def nseries(self): + return 1 def _make_plot(self): import matplotlib.pyplot as plt @@ -1454,7 +1498,7 @@ def _post_plot_logic(self): class LinePlot(MPLPlot): - _default_rot = 30 + _default_rot = 0 orientation = 'vertical' def __init__(self, data, **kwargs): @@ -1635,6 +1679,10 @@ def _post_plot_logic(self): for ax in self.axes: if condition: + # irregular TS rotated 30 deg. by default + # probably a better place to check / set this. 
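
A short sketch of the new scatter colouring path: when `c` names a DataFrame column, its values colour the points and a colorbar is drawn by default. The frame and column names are illustrative assumptions, not part of the patch.

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.rand(30, 3), columns=['a', 'b', 'c'])

    # colour each point by the values of column 'c'; a colorbar is added automatically
    ax = df.plot(kind='scatter', x='a', y='b', c='c')

    # colormap picks the palette; colorbar=False suppresses the bar
    ax = df.plot(kind='scatter', x='a', y='b', c='c', colormap='Greens', colorbar=False)
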
+ if not self._rot_set: + self.rot = 30 format_date_labels(ax, rot=self.rot) if index_name is not None: @@ -1657,7 +1705,7 @@ def _get_plot_function(self): raise ValueError("Log-y scales are not supported in area plot") else: f = MPLPlot._get_plot_function(self) - def plotf(ax, x, y, style=None, column_num=0, **kwds): + def plotf(ax, x, y, style=None, column_num=None, **kwds): if column_num == 0: self._initialize_prior(len(self.data)) y_values = self._get_stacked_values(y, kwds['label']) @@ -1712,8 +1760,8 @@ def __init__(self, data, **kwargs): kwargs.setdefault('align', 'center') self.tick_pos = np.arange(len(data)) - self.bottom = kwargs.pop('bottom', None) - self.left = kwargs.pop('left', None) + self.bottom = kwargs.pop('bottom', 0) + self.left = kwargs.pop('left', 0) self.log = kwargs.pop('log',False) MPLPlot.__init__(self, data, **kwargs) @@ -1744,13 +1792,11 @@ def _args_adjust(self): def _get_plot_function(self): if self.kind == 'bar': def f(ax, x, y, w, start=None, **kwds): - if self.bottom is not None: - start = start + self.bottom + start = start + self.bottom return ax.bar(x, y, w, bottom=start,log=self.log, **kwds) elif self.kind == 'barh': def f(ax, x, y, w, start=None, log=self.log, **kwds): - if self.left is not None: - start = start + self.left + start = start + self.left return ax.barh(x, y, w, left=start, **kwds) else: raise NotImplementedError @@ -1770,7 +1816,7 @@ def _make_plot(self): pos_prior = neg_prior = np.zeros(len(self.data)) K = self.nseries - for i, (label, y) in enumerate(self._iter_data()): + for i, (label, y) in enumerate(self._iter_data(fillna=0)): ax = self._get_ax(i) kwds = self.kwds.copy() kwds['color'] = colors[i % ncolors] @@ -1944,6 +1990,7 @@ def _get_plot_function(self): from scipy import __version__ as spv f = MPLPlot._get_plot_function(self) def plotf(ax, y, style=None, column_num=None, **kwds): + y = remove_na(y) if LooseVersion(spv) >= '0.11.0': gkde = gaussian_kde(y, bw_method=self.bw_method) else: @@ -1965,6 +2012,7 @@ def _post_plot_logic(self): class PiePlot(MPLPlot): + _layout_type = 'horizontal' def __init__(self, data, kind=None, **kwargs): data = data.fillna(value=0) @@ -1978,13 +2026,6 @@ def _args_adjust(self): self.logx = False self.loglog = False - def _get_layout(self): - from pandas import DataFrame - if isinstance(self.data, DataFrame): - return (1, len(self.data.columns)) - else: - return (1, 1) - def _validate_color_args(self): pass @@ -2000,10 +2041,23 @@ def _make_plot(self): kwds = self.kwds.copy() + def blank_labeler(label, value): + if value == 0: + return '' + else: + return label + idx = [com.pprint_thing(v) for v in self.data.index] labels = kwds.pop('labels', idx) # labels is used for each wedge's labels - results = ax.pie(y, labels=labels, **kwds) + # Blank out labels for values of 0 so they don't overlap + # with nonzero wedges + if labels is not None: + blabels = [blank_labeler(label, value) for + label, value in zip(labels, y)] + else: + blabels = None + results = ax.pie(y, labels=blabels, **kwds) if kwds.get('autopct', None) is not None: patches, texts, autotexts = results @@ -2021,12 +2075,156 @@ def _make_plot(self): self._add_legend_handle(p, l) -class BoxPlot(MPLPlot): - pass +class BoxPlot(LinePlot): + _layout_type = 'horizontal' + + _valid_return_types = (None, 'axes', 'dict', 'both') + # namedtuple to hold results + BP = namedtuple("Boxplot", ['ax', 'lines']) + + def __init__(self, data, return_type=None, **kwargs): + # Do not call LinePlot.__init__ which may fill nan + if return_type not in 
self._valid_return_types: + raise ValueError("return_type must be {None, 'axes', 'dict', 'both'}") + + self.return_type = return_type + MPLPlot.__init__(self, data, **kwargs) + + def _args_adjust(self): + if self.subplots: + # Disable label ax sharing. Otherwise, all subplots shows last column label + if self.orientation == 'vertical': + self.sharex = False + else: + self.sharey = False + + def _get_plot_function(self): + def plotf(ax, y, column_num=None, **kwds): + if y.ndim == 2: + y = [remove_na(v) for v in y] + # Boxplot fails with empty arrays, so need to add a NaN + # if any cols are empty + # GH 8181 + y = [v if v.size > 0 else np.array([np.nan]) for v in y] + else: + y = remove_na(y) + bp = ax.boxplot(y, **kwds) + + if self.return_type == 'dict': + return bp, bp + elif self.return_type == 'both': + return self.BP(ax=ax, lines=bp), bp + else: + return ax, bp + return plotf + + def _validate_color_args(self): + if 'color' in self.kwds: + if self.colormap is not None: + warnings.warn("'color' and 'colormap' cannot be used " + "simultaneously. Using 'color'") + self.color = self.kwds.pop('color') + + if isinstance(self.color, dict): + valid_keys = ['boxes', 'whiskers', 'medians', 'caps'] + for key, values in compat.iteritems(self.color): + if key not in valid_keys: + raise ValueError("color dict contains invalid key '{0}' " + "The key must be either {1}".format(key, valid_keys)) + else: + self.color = None + + # get standard colors for default + colors = _get_standard_colors(num_colors=3, + colormap=self.colormap, + color=None) + # use 2 colors by default, for box/whisker and median + # flier colors isn't needed here + # because it can be specified by ``sym`` kw + self._boxes_c = colors[0] + self._whiskers_c = colors[0] + self._medians_c = colors[2] + self._caps_c = 'k' # mpl default + + def _get_colors(self, num_colors=None, color_kwds='color'): + pass + + def maybe_color_bp(self, bp): + if isinstance(self.color, dict): + boxes = self.color.get('boxes', self._boxes_c) + whiskers = self.color.get('whiskers', self._whiskers_c) + medians = self.color.get('medians', self._medians_c) + caps = self.color.get('caps', self._caps_c) + else: + # Other types are forwarded to matplotlib + # If None, use default colors + boxes = self.color or self._boxes_c + whiskers = self.color or self._whiskers_c + medians = self.color or self._medians_c + caps = self.color or self._caps_c + + from matplotlib.artist import setp + setp(bp['boxes'], color=boxes, alpha=1) + setp(bp['whiskers'], color=whiskers, alpha=1) + setp(bp['medians'], color=medians, alpha=1) + setp(bp['caps'], color=caps, alpha=1) + + def _make_plot(self): + plotf = self._get_plot_function() + if self.subplots: + self._return_obj = compat.OrderedDict() + + for i, (label, y) in enumerate(self._iter_data()): + ax = self._get_ax(i) + kwds = self.kwds.copy() + + ret, bp = plotf(ax, y, column_num=i, **kwds) + self.maybe_color_bp(bp) + self._return_obj[label] = ret + + label = [com.pprint_thing(label)] + self._set_ticklabels(ax, label) + else: + y = self.data.values.T + ax = self._get_ax(0) + kwds = self.kwds.copy() + + ret, bp = plotf(ax, y, column_num=0, **kwds) + self.maybe_color_bp(bp) + self._return_obj = ret + + labels = [l for l, y in self._iter_data()] + labels = [com.pprint_thing(l) for l in labels] + if not self.use_index: + labels = [com.pprint_thing(key) for key in range(len(labels))] + self._set_ticklabels(ax, labels) + + def _set_ticklabels(self, ax, labels): + if self.orientation == 'vertical': + ax.set_xticklabels(labels) + else: + 
ax.set_yticklabels(labels) + + def _post_plot_logic(self): + pass + + @property + def orientation(self): + if self.kwds.get('vert', True): + return 'vertical' + else: + return 'horizontal' + + @property + def result(self): + if self.return_type is None: + return super(BoxPlot, self).result + else: + return self._return_obj # kinds supported by both dataframe and series -_common_kinds = ['line', 'bar', 'barh', 'kde', 'density', 'area', 'hist'] +_common_kinds = ['line', 'bar', 'barh', 'kde', 'density', 'area', 'hist', 'box'] # kinds supported by dataframe _dataframe_kinds = ['scatter', 'hexbin'] # kinds supported only by series or dataframe single column @@ -2034,62 +2232,150 @@ class BoxPlot(MPLPlot): _all_kinds = _common_kinds + _dataframe_kinds + _series_kinds _plot_klass = {'line': LinePlot, 'bar': BarPlot, 'barh': BarPlot, - 'kde': KdePlot, 'hist': HistPlot, + 'kde': KdePlot, 'hist': HistPlot, 'box': BoxPlot, 'scatter': ScatterPlot, 'hexbin': HexBinPlot, 'area': AreaPlot, 'pie': PiePlot} -def plot_frame(frame=None, x=None, y=None, subplots=False, sharex=True, - sharey=False, use_index=True, figsize=None, grid=None, - legend=True, rot=None, ax=None, style=None, title=None, - xlim=None, ylim=None, logx=False, logy=False, xticks=None, - yticks=None, kind='line', sort_columns=False, fontsize=None, - secondary_y=False, **kwds): +def _plot(data, x=None, y=None, subplots=False, + ax=None, kind='line', **kwds): + kind = _get_standard_kind(kind.lower().strip()) + if kind in _all_kinds: + klass = _plot_klass[kind] + else: + raise ValueError('Invalid chart type given %s' % kind) - """ - Make line, bar, or scatter plots of DataFrame series with the index on the x-axis - using matplotlib / pylab. + from pandas import DataFrame + if kind in _dataframe_kinds: + if isinstance(data, DataFrame): + plot_obj = klass(data, x=x, y=y, subplots=subplots, ax=ax, + kind=kind, **kwds) + else: + raise ValueError('Invalid chart type given %s' % kind) - Parameters - ---------- - frame : DataFrame - x : label or position, default None + elif kind in _series_kinds: + if isinstance(data, DataFrame): + if y is None and subplots is False: + msg = "{0} requires either y column or 'subplots=True'" + raise ValueError(msg.format(kind)) + elif y is not None: + if com.is_integer(y) and not data.columns.holds_integer(): + y = data.columns[y] + data = data[y] # converted to series actually + data.index.name = y + plot_obj = klass(data, subplots=subplots, ax=ax, kind=kind, **kwds) + else: + if isinstance(data, DataFrame): + if x is not None: + if com.is_integer(x) and not data.columns.holds_integer(): + x = data.columns[x] + data = data.set_index(x) + + if y is not None: + if com.is_integer(y) and not data.columns.holds_integer(): + y = data.columns[y] + label = x if x is not None else data.index.name + label = kwds.pop('label', label) + series = data[y] + series.index.name = label + + for kw in ['xerr', 'yerr']: + if (kw in kwds) and \ + (isinstance(kwds[kw], string_types) or + com.is_integer(kwds[kw])): + try: + kwds[kw] = data[kwds[kw]] + except (IndexError, KeyError, TypeError): + pass + data = series + plot_obj = klass(data, subplots=subplots, ax=ax, kind=kind, **kwds) + + plot_obj.generate() + plot_obj.draw() + return plot_obj.result + + +df_kind = """- 'scatter' : scatter plot + - 'hexbin' : hexbin plot""" +series_kind = "" + +df_coord = """x : label or position, default None y : label or position, default None - Allows plotting of one column versus another - yerr : DataFrame (with matching labels), Series, list-type 
(tuple, list, - ndarray), or str of column name containing y error values - xerr : similar functionality as yerr, but for x error values + Allows plotting of one column versus another""" +series_coord = "" + +df_unique = """stacked : boolean, default False in line and + bar plots, and True in area plot. If True, create stacked plot. + sort_columns : boolean, default False + Sort column names to determine plot ordering + secondary_y : boolean or sequence, default False + Whether to plot on the secondary y-axis + If a list/tuple, which columns to plot on secondary y-axis""" +series_unique = """label : label argument to provide to plot + secondary_y : boolean or sequence of ints, default False + If True then y-axis will be on the right""" + +df_ax = """ax : matplotlib axes object, default None subplots : boolean, default False - Make separate subplots for each time series + Make separate subplots for each column sharex : boolean, default True In case subplots=True, share x axis sharey : boolean, default False In case subplots=True, share y axis + layout : tuple (optional) + (rows, columns) for the layout of subplots""" +series_ax = """ax : matplotlib axes object + If not passed, uses gca()""" + +df_note = """- If `kind` = 'scatter' and the argument `c` is the name of a dataframe + column, the values of that column are used to color each point. + - If `kind` = 'hexbin', you can control the size of the bins with the + `gridsize` argument. By default, a histogram of the counts around each + `(x, y)` point is computed. You can specify alternative aggregations + by passing values to the `C` and `reduce_C_function` arguments. + `C` specifies the value at each `(x, y)` point and `reduce_C_function` + is a function of one argument that reduces all the values in a bin to + a single number (e.g. `mean`, `max`, `sum`, `std`).""" +series_note = "" + +_shared_doc_df_kwargs = dict(klass='DataFrame', klass_kind=df_kind, + klass_coord=df_coord, klass_ax=df_ax, + klass_unique=df_unique, klass_note=df_note) +_shared_doc_series_kwargs = dict(klass='Series', klass_kind=series_kind, + klass_coord=series_coord, klass_ax=series_ax, + klass_unique=series_unique, + klass_note=series_note) + +_shared_docs['plot'] = """ + Make plots of %(klass)s using matplotlib / pylab. + + Parameters + ---------- + data : %(klass)s + %(klass_coord)s + kind : str + - 'line' : line plot (default) + - 'bar' : vertical bar plot + - 'barh' : horizontal bar plot + - 'hist' : histogram + - 'box' : boxplot + - 'kde' : Kernel Density Estimation plot + - 'density' : same as 'kde' + - 'area' : area plot + - 'pie' : pie plot + %(klass_kind)s + %(klass_ax)s + figsize : a tuple (width, height) in inches use_index : boolean, default True Use index as ticks for x axis - stacked : boolean, default False - If True, create stacked bar plot. 
Only valid for DataFrame input - sort_columns: boolean, default False - Sort column names to determine plot ordering title : string Title to use for the plot grid : boolean, default None (matlab style default) Axis grid lines legend : False/True/'reverse' Place legend on axis subplots - - ax : matplotlib axis object, default None style : list or dict matplotlib line style per column - kind : {'line', 'bar', 'barh', 'hist', 'kde', 'density', 'area', 'scatter', 'hexbin'} - line : line plot - bar : vertical bar plot - barh : horizontal bar plot - hist : histogram - kde/density : Kernel Density Estimation plot - area : area plot - scatter : scatter plot - hexbin : hexbin plot logx : boolean, default False Use log scaling on x axis logy : boolean, default False @@ -2104,214 +2390,101 @@ def plot_frame(frame=None, x=None, y=None, subplots=False, sharex=True, ylim : 2-tuple/list rot : int, default None Rotation for ticks - secondary_y : boolean or sequence, default False - Whether to plot on the secondary y-axis - If a list/tuple, which columns to plot on secondary y-axis - mark_right: boolean, default True - When using a secondary_y axis, should the legend label the axis of - the various columns automatically + fontsize : int, default None + Font size for ticks colormap : str or matplotlib colormap object, default None Colormap to select colors from. If string, load colormap with that name from matplotlib. + colorbar : boolean, optional + If True, plot colorbar (only relevant for 'scatter' and 'hexbin' plots) position : float Specify relative alignments for bar plot layout. From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5 (center) + layout : tuple (optional) + (rows, columns) for the layout of the plot table : boolean, Series or DataFrame, default False If True, draw a table using the data in the DataFrame and the data will be transposed to meet matplotlib's default layout. If a Series or DataFrame is passed, use passed data to draw a table. + yerr : DataFrame, Series, array-like, dict and str + See :ref:`Plotting with Error Bars ` for detail. + xerr : same types as yerr. + %(klass_unique)s + mark_right : boolean, default True + When using a secondary_y axis, automatically mark the column + labels with "(right)" in the legend kwds : keywords Options to pass to matplotlib plotting method Returns ------- - ax_or_axes : matplotlib.AxesSubplot or list of them + axes : matplotlib.AxesSubplot or np.array of them Notes ----- - If `kind`='hexbin', you can control the size of the bins with the - `gridsize` argument. By default, a histogram of the counts around each - `(x, y)` point is computed. You can specify alternative aggregations - by passing values to the `C` and `reduce_C_function` arguments. - `C` specifies the value at each `(x, y)` point and `reduce_C_function` - is a function of one argument that reduces all the values in a bin to - a single number (e.g. `mean`, `max`, `sum`, `std`). 
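
A hedged usage sketch of the hexbin notes above: `C` supplies the value at each point and `reduce_C_function` collapses each bin to a single number; without them a count is plotted. The data and column names are made up for the example.

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'x': np.random.randn(1000),
                       'y': np.random.randn(1000),
                       'z': np.random.uniform(0, 10, 1000)})

    # default: a count of points is aggregated around each (x, y) bin
    ax = df.plot(kind='hexbin', x='x', y='y', gridsize=25)

    # C gives the value at each point; reduce_C_function reduces each bin to one number
    ax = df.plot(kind='hexbin', x='x', y='y', C='z',
                 reduce_C_function=np.max, gridsize=25)
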
- """ - - kind = _get_standard_kind(kind.lower().strip()) - if kind in _all_kinds: - klass = _plot_klass[kind] - else: - raise ValueError('Invalid chart type given %s' % kind) - - if kind in _dataframe_kinds: - plot_obj = klass(frame, x=x, y=y, kind=kind, subplots=subplots, - rot=rot,legend=legend, ax=ax, style=style, - fontsize=fontsize, use_index=use_index, sharex=sharex, - sharey=sharey, xticks=xticks, yticks=yticks, - xlim=xlim, ylim=ylim, title=title, grid=grid, - figsize=figsize, logx=logx, logy=logy, - sort_columns=sort_columns, secondary_y=secondary_y, - **kwds) - elif kind in _series_kinds: - if y is None and subplots is False: - msg = "{0} requires either y column or 'subplots=True'" - raise ValueError(msg.format(kind)) - elif y is not None: - if com.is_integer(y) and not frame.columns.holds_integer(): - y = frame.columns[y] - frame = frame[y] # converted to series actually - frame.index.name = y - - plot_obj = klass(frame, kind=kind, subplots=subplots, - rot=rot,legend=legend, ax=ax, style=style, - fontsize=fontsize, use_index=use_index, sharex=sharex, - sharey=sharey, xticks=xticks, yticks=yticks, - xlim=xlim, ylim=ylim, title=title, grid=grid, - figsize=figsize, - sort_columns=sort_columns, - **kwds) - else: - if x is not None: - if com.is_integer(x) and not frame.columns.holds_integer(): - x = frame.columns[x] - frame = frame.set_index(x) - - if y is not None: - if com.is_integer(y) and not frame.columns.holds_integer(): - y = frame.columns[y] - label = x if x is not None else frame.index.name - label = kwds.pop('label', label) - ser = frame[y] - ser.index.name = label - - for kw in ['xerr', 'yerr']: - if (kw in kwds) and \ - (isinstance(kwds[kw], string_types) or com.is_integer(kwds[kw])): - try: - kwds[kw] = frame[kwds[kw]] - except (IndexError, KeyError, TypeError): - pass - - return plot_series(ser, label=label, kind=kind, - use_index=use_index, - rot=rot, xticks=xticks, yticks=yticks, - xlim=xlim, ylim=ylim, ax=ax, style=style, - grid=grid, logx=logx, logy=logy, - secondary_y=secondary_y, title=title, - figsize=figsize, fontsize=fontsize, **kwds) + - See matplotlib documentation online for more on this subject + - If `kind` = 'bar' or 'barh', you can specify relative alignments + for bar plot layout by `position` keyword. + From 0 (left/bottom-end) to 1 (right/top-end). 
Default is 0.5 (center) + %(klass_note)s - else: - plot_obj = klass(frame, kind=kind, subplots=subplots, rot=rot, - legend=legend, ax=ax, style=style, fontsize=fontsize, - use_index=use_index, sharex=sharex, sharey=sharey, - xticks=xticks, yticks=yticks, xlim=xlim, ylim=ylim, - title=title, grid=grid, figsize=figsize, logx=logx, - logy=logy, sort_columns=sort_columns, - secondary_y=secondary_y, **kwds) - - plot_obj.generate() - plot_obj.draw() - if subplots: - return plot_obj.axes - else: - return plot_obj.axes[0] - - -def plot_series(series, label=None, kind='line', use_index=True, rot=None, - xticks=None, yticks=None, xlim=None, ylim=None, - ax=None, style=None, grid=None, legend=False, logx=False, - logy=False, secondary_y=False, **kwds): """ - Plot the input series with the index on the x-axis using matplotlib - Parameters - ---------- - label : label argument to provide to plot - kind : {'line', 'bar', 'barh', 'hist', 'kde', 'density', 'area'} - line : line plot - bar : vertical bar plot - barh : horizontal bar plot - hist : histogram - kde/density : Kernel Density Estimation plot - area : area plot - use_index : boolean, default True - Plot index as axis tick labels - rot : int, default None - Rotation for tick labels - xticks : sequence - Values to use for the xticks - yticks : sequence - Values to use for the yticks - xlim : 2-tuple/list - ylim : 2-tuple/list - ax : matplotlib axis object - If not passed, uses gca() - style : string, default matplotlib default - matplotlib line style to use - grid : matplotlib grid - legend: matplotlib legend - logx : boolean, default False - Use log scaling on x axis - logy : boolean, default False - Use log scaling on y axis - loglog : boolean, default False - Use log scaling on both x and y axes - secondary_y : boolean or sequence of ints, default False - If True then y-axis will be on the right - figsize : a tuple (width, height) in inches - position : float - Specify relative alignments for bar plot layout. - From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5 (center) - table : boolean, Series or DataFrame, default False - If True, draw a table using the data in the Series and the data will - be transposed to meet matplotlib's default layout. - If a Series or DataFrame is passed, use passed data to draw a table. 
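
A small sketch of the new 'box' kind and the shared return_type choices (None, 'axes', 'dict', 'both') validated above; the frame is an invented example.

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.randn(50, 3), columns=['one', 'two', 'three'])

    # 'box' is now one of the common plot kinds
    ax = df.plot(kind='box')

    # the standalone boxplot keeps the same return_type options
    both = df.boxplot(return_type='both')   # namedtuple of the axes and the matplotlib lines
    artists = df.boxplot(return_type='dict')
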
- kwds : keywords - Options to pass to matplotlib plotting method - - Notes - ----- - See matplotlib documentation online for more on this subject - """ - - kind = _get_standard_kind(kind.lower().strip()) - if kind in _common_kinds or kind in _series_kinds: - klass = _plot_klass[kind] - else: - raise ValueError('Invalid chart type given %s' % kind) +@Appender(_shared_docs['plot'] % _shared_doc_df_kwargs) +def plot_frame(data, x=None, y=None, kind='line', ax=None, # Dataframe unique + subplots=False, sharex=True, sharey=False, layout=None, # Dataframe unique + figsize=None, use_index=True, title=None, grid=None, + legend=True, style=None, logx=False, logy=False, loglog=False, + xticks=None, yticks=None, xlim=None, ylim=None, + rot=None, fontsize=None, colormap=None, table=False, + yerr=None, xerr=None, + secondary_y=False, sort_columns=False, # Dataframe unique + **kwds): + return _plot(data, kind=kind, x=x, y=y, ax=ax, + subplots=subplots, sharex=sharex, sharey=sharey, + layout=layout, figsize=figsize, use_index=use_index, + title=title, grid=grid, legend=legend, + style=style, logx=logx, logy=logy, loglog=loglog, + xticks=xticks, yticks=yticks, xlim=xlim, ylim=ylim, + rot=rot, fontsize=fontsize, colormap=colormap, table=table, + yerr=yerr, xerr=xerr, + secondary_y=secondary_y, sort_columns=sort_columns, + **kwds) + + +@Appender(_shared_docs['plot'] % _shared_doc_series_kwargs) +def plot_series(data, kind='line', ax=None, # Series unique + figsize=None, use_index=True, title=None, grid=None, + legend=False, style=None, logx=False, logy=False, loglog=False, + xticks=None, yticks=None, xlim=None, ylim=None, + rot=None, fontsize=None, colormap=None, table=False, + yerr=None, xerr=None, + label=None, secondary_y=False, # Series unique + **kwds): + import matplotlib.pyplot as plt """ - If no axis is specified, we check whether there are existing figures. - If so, we get the current axis and check whether yaxis ticks are on the - right. Ticks for the plot of the series will be on the right unless - there is at least one axis with ticks on the left. - - If we do not check for whether there are existing figures, _gca() will - create a figure with the default figsize, causing the figsize= parameter to + If no axes is specified, check whether there are existing figures + If there is no existing figures, _gca() will + create a figure with the default figsize, causing the figsize=parameter to be ignored. """ - import matplotlib.pyplot as plt if ax is None and len(plt.get_fignums()) > 0: ax = _gca() ax = getattr(ax, 'left_ax', ax) - # is there harm in this? 
if label is None: - label = series.name - - plot_obj = klass(series, kind=kind, rot=rot, logx=logx, logy=logy, - ax=ax, use_index=use_index, style=style, - xticks=xticks, yticks=yticks, xlim=xlim, ylim=ylim, - legend=legend, grid=grid, label=label, - secondary_y=secondary_y, **kwds) - - plot_obj.generate() - plot_obj.draw() - - # plot_obj.ax is None if we created the first figure - return plot_obj.axes[0] + label = data.name + return _plot(data, kind=kind, ax=ax, + figsize=figsize, use_index=use_index, title=title, + grid=grid, legend=legend, + style=style, logx=logx, logy=logy, loglog=loglog, + xticks=xticks, yticks=yticks, xlim=xlim, ylim=ylim, + rot=rot, fontsize=fontsize, colormap=colormap, table=table, + yerr=yerr, xerr=xerr, + label=label, secondary_y=secondary_y, + **kwds) _shared_docs['boxplot'] = """ @@ -2364,9 +2537,8 @@ def boxplot(data, column=None, by=None, ax=None, fontsize=None, **kwds): # validate return_type: - valid_types = (None, 'axes', 'dict', 'both') - if return_type not in valid_types: - raise ValueError("return_type") + if return_type not in BoxPlot._valid_return_types: + raise ValueError("return_type must be {None, 'axes', 'dict', 'both'}") from pandas import Series, DataFrame if isinstance(data, Series): @@ -2377,13 +2549,11 @@ def _get_colors(): return _get_standard_colors(color=kwds.get('color'), num_colors=1) def maybe_color_bp(bp): - if 'color' not in kwds : + if 'color' not in kwds: from matplotlib.artist import setp - setp(bp['boxes'],color=colors[0],alpha=1) - setp(bp['whiskers'],color=colors[0],alpha=1) - setp(bp['medians'],color=colors[2],alpha=1) - - BP = namedtuple("Boxplot", ['ax', 'lines']) # namedtuple to hold results + setp(bp['boxes'], color=colors[0], alpha=1) + setp(bp['whiskers'], color=colors[0], alpha=1) + setp(bp['medians'], color=colors[2], alpha=1) def plot_group(keys, values, ax): keys = [com.pprint_thing(x) for x in keys] @@ -2399,7 +2569,7 @@ def plot_group(keys, values, ax): if return_type == 'dict': return bp elif return_type == 'both': - return BP(ax=ax, lines=bp) + return BoxPlot.BP(ax=ax, lines=bp) else: return ax @@ -2415,7 +2585,8 @@ def plot_group(keys, values, ax): if by is not None: result = _grouped_plot_by_column(plot_group, data, columns=columns, by=by, grid=grid, figsize=figsize, - ax=ax, layout=layout, return_type=return_type) + ax=ax, layout=layout, + return_type=return_type) else: if layout is not None: raise ValueError("The 'layout' keyword is not supported when " @@ -2455,7 +2626,8 @@ def format_date_labels(ax, rot): pass -def scatter_plot(data, x, y, by=None, ax=None, figsize=None, grid=False, **kwargs): +def scatter_plot(data, x, y, by=None, ax=None, figsize=None, grid=False, + **kwargs): """ Make a scatter plot from two DataFrame columns @@ -2551,12 +2723,13 @@ def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, data = data._get_numeric_data() naxes = len(data.columns) - nrows, ncols = _get_layout(naxes, layout=layout) - fig, axes = _subplots(nrows=nrows, ncols=ncols, naxes=naxes, ax=ax, squeeze=False, - sharex=sharex, sharey=sharey, figsize=figsize) + fig, axes = _subplots(naxes=naxes, ax=ax, squeeze=False, + sharex=sharex, sharey=sharey, figsize=figsize, + layout=layout) + _axes = _flatten(axes) for i, col in enumerate(com._try_sort(data.columns)): - ax = axes[i // ncols, i % ncols] + ax = _axes[i] ax.hist(data[col].dropna().values, bins=bins, **kwds) ax.set_title(col) ax.grid(grid) @@ -2672,7 +2845,7 @@ def plot_group(group, ax): xrot = xrot or rot fig, axes = _grouped_plot(plot_group, data, 
column=column, - by=by, sharex=sharex, sharey=sharey, + by=by, sharex=sharex, sharey=sharey, ax=ax, figsize=figsize, layout=layout, rot=rot) _set_ticks_props(axes, xlabelsize=xlabelsize, xrot=xrot, @@ -2730,9 +2903,9 @@ def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None, """ if subplots is True: naxes = len(grouped) - nrows, ncols = _get_layout(naxes, layout=layout) - fig, axes = _subplots(nrows=nrows, ncols=ncols, naxes=naxes, squeeze=False, - ax=ax, sharex=False, sharey=True, figsize=figsize) + fig, axes = _subplots(naxes=naxes, squeeze=False, + ax=ax, sharex=False, sharey=True, figsize=figsize, + layout=layout) axes = _flatten(axes) ret = compat.OrderedDict() @@ -2773,14 +2946,14 @@ def _grouped_plot(plotf, data, column=None, by=None, numeric_only=True, grouped = grouped[column] naxes = len(grouped) - nrows, ncols = _get_layout(naxes, layout=layout) - fig, axes = _subplots(nrows=nrows, ncols=ncols, naxes=naxes, - figsize=figsize, sharex=sharex, sharey=sharey, ax=ax) + fig, axes = _subplots(naxes=naxes, figsize=figsize, + sharex=sharex, sharey=sharey, ax=ax, + layout=layout) - ravel_axes = _flatten(axes) + _axes = _flatten(axes) for i, (key, group) in enumerate(grouped): - ax = ravel_axes[i] + ax = _axes[i] if numeric_only and isinstance(group, DataFrame): group = group._get_numeric_data() plotf(group, ax, **kwargs) @@ -2799,16 +2972,14 @@ def _grouped_plot_by_column(plotf, data, columns=None, by=None, by = [by] columns = data._get_numeric_data().columns - by naxes = len(columns) - nrows, ncols = _get_layout(naxes, layout=layout) - fig, axes = _subplots(nrows=nrows, ncols=ncols, naxes=naxes, - sharex=True, sharey=True, - figsize=figsize, ax=ax) + fig, axes = _subplots(naxes=naxes, sharex=True, sharey=True, + figsize=figsize, ax=ax, layout=layout) - ravel_axes = _flatten(axes) + _axes = _flatten(axes) result = compat.OrderedDict() for i, col in enumerate(columns): - ax = ravel_axes[i] + ax = _axes[i] gp_col = grouped[col] keys, values = zip(*gp_col) re_plotf = plotf(keys, values, ax, **kwargs) @@ -2869,39 +3040,54 @@ def table(ax, data, rowLabels=None, colLabels=None, return table -def _get_layout(nplots, layout=None): +def _get_layout(nplots, layout=None, layout_type='box'): if layout is not None: if not isinstance(layout, (tuple, list)) or len(layout) != 2: raise ValueError('Layout must be a tuple of (rows, columns)') nrows, ncols = layout + + # Python 2 compat + ceil_ = lambda x: int(ceil(x)) + if nrows == -1 and ncols >0: + layout = nrows, ncols = (ceil_(float(nplots) / ncols), ncols) + elif ncols == -1 and nrows > 0: + layout = nrows, ncols = (nrows, ceil_(float(nplots) / nrows)) + elif ncols <= 0 and nrows <= 0: + msg = "At least one dimension of layout must be positive" + raise ValueError(msg) + if nrows * ncols < nplots: raise ValueError('Layout of %sx%s must be larger than required size %s' % (nrows, ncols, nplots)) return layout - if nplots == 1: + if layout_type == 'single': return (1, 1) - elif nplots == 2: - return (1, 2) - elif nplots < 4: - return (2, 2) + elif layout_type == 'horizontal': + return (1, nplots) + elif layout_type == 'vertical': + return (nplots, 1) - k = 1 - while k ** 2 < nplots: - k += 1 - - if (k - 1) * k >= nplots: - return k, (k - 1) - else: - return k, k + layouts = {1: (1, 1), 2: (1, 2), 3: (2, 2), 4: (2, 2)} + try: + return layouts[nplots] + except KeyError: + k = 1 + while k ** 2 < nplots: + k += 1 + + if (k - 1) * k >= nplots: + return k, (k - 1) + else: + return k, k -# copied from matplotlib/pyplot.py for compatibility 
with matplotlib < 1.0 +# copied from matplotlib/pyplot.py and modified for pandas.plotting -def _subplots(nrows=1, ncols=1, naxes=None, sharex=False, sharey=False, squeeze=True, - subplot_kw=None, ax=None, **fig_kw): +def _subplots(naxes=None, sharex=False, sharey=False, squeeze=True, + subplot_kw=None, ax=None, layout=None, layout_type='box', **fig_kw): """Create a figure with a set of subplots already made. This utility wrapper makes it convenient to create common layouts of @@ -2909,12 +3095,6 @@ def _subplots(nrows=1, ncols=1, naxes=None, sharex=False, sharey=False, squeeze= Keyword arguments: - nrows : int - Number of rows of the subplot grid. Defaults to 1. - - ncols : int - Number of columns of the subplot grid. Defaults to 1. - naxes : int Number of required axes. Exceeded axes are set invisible. Default is nrows * ncols. @@ -2942,11 +3122,17 @@ def _subplots(nrows=1, ncols=1, naxes=None, sharex=False, sharey=False, squeeze= ax : Matplotlib axis object, optional + layout : tuple + Number of rows and columns of the subplot grid. + If not specified, calculated from naxes and layout_type + + layout_type : {'box', 'horziontal', 'vertical'}, default 'box' + Specify how to layout the subplot grid. + fig_kw : Other keyword arguments to be passed to the figure() call. Note that all keywords not recognized above will be automatically included here. - Returns: fig, ax : tuple @@ -2975,23 +3161,27 @@ def _subplots(nrows=1, ncols=1, naxes=None, sharex=False, sharey=False, squeeze= plt.subplots(2, 2, subplot_kw=dict(polar=True)) """ import matplotlib.pyplot as plt - from pandas.core.frame import DataFrame if subplot_kw is None: subplot_kw = {} - # Create empty object array to hold all axes. It's easiest to make it 1-d - # so we can just append subplots upon creation, and then - nplots = nrows * ncols - - if naxes is None: - naxes = nrows * ncols - elif nplots < naxes: - raise ValueError("naxes {0} is larger than layour size defined by nrows * ncols".format(naxes)) - if ax is None: fig = plt.figure(**fig_kw) else: + if com.is_list_like(ax): + ax = _flatten(ax) + if layout is not None: + warnings.warn("When passing multiple axes, layout keyword is ignored", UserWarning) + if sharex or sharey: + warnings.warn("When passing multiple axes, sharex and sharey are ignored." + "These settings must be specified when creating axes", UserWarning) + if len(ax) == naxes: + fig = ax[0].get_figure() + return fig, ax + else: + raise ValueError("The number of passed axes must be {0}, the same as " + "the output plot".format(naxes)) + fig = ax.get_figure() # if ax is passed and a number of subplots is 1, return ax as it is if naxes == 1: @@ -3004,6 +3194,11 @@ def _subplots(nrows=1, ncols=1, naxes=None, sharex=False, sharey=False, squeeze= "is being cleared", UserWarning) fig.clear() + nrows, ncols = _get_layout(naxes, layout=layout, layout_type=layout_type) + nplots = nrows * ncols + + # Create empty object array to hold all axes. It's easiest to make it 1-d + # so we can just append subplots upon creation, and then axarr = np.empty(nplots, dtype=object) # Create first subplot separately, so we can share it if requested @@ -3018,7 +3213,14 @@ def _subplots(nrows=1, ncols=1, naxes=None, sharex=False, sharey=False, squeeze= # Note off-by-one counting because add_subplot uses the MATLAB 1-based # convention. 
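
A brief sketch of the layout handling added in _get_layout/_subplots: a (rows, columns) tuple controls the subplot grid, and one dimension may be -1 to be computed from the number of required axes. The frame is assumed for illustration.

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.randn(20, 4), columns=list('abcd'))

    # explicit (rows, columns) grid for the per-column subplots
    axes = df.plot(subplots=True, layout=(2, 2))

    # one dimension may be -1 and is filled in from the number of required axes
    axes = df.plot(subplots=True, layout=(2, -1))   # resolves to (2, 2) for four columns
    axes = df.hist(layout=(-1, 2))                  # rows = ceil(naxes / 2)
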
for i in range(1, nplots): - ax = fig.add_subplot(nrows, ncols, i + 1, **subplot_kw) + kwds = subplot_kw.copy() + # Set sharex and sharey to None for blank/dummy axes, these can + # interfere with proper axis limits on the visible axes if + # they share axes e.g. issue #7528 + if i >= naxes: + kwds['sharex'] = None + kwds['sharey'] = None + ax = fig.add_subplot(nrows, ncols, i + 1, **kwds) axarr[i] = ax if nplots > 1: @@ -3074,10 +3276,10 @@ def _subplots(nrows=1, ncols=1, naxes=None, sharex=False, sharey=False, squeeze= def _flatten(axes): if not com.is_list_like(axes): - axes = [axes] + return np.array([axes]) elif isinstance(axes, (np.ndarray, Index)): - axes = axes.ravel() - return axes + return axes.ravel() + return np.array(axes) def _get_all_lines(ax): diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 749f15af0d916..b9c7fdfeb6c48 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -14,7 +14,7 @@ from pandas.tseries.index import DatetimeIndex from pandas.tools.merge import merge, concat, ordered_merge, MergeError from pandas.util.testing import (assert_frame_equal, assert_series_equal, - assert_almost_equal, rands, + assert_almost_equal, makeCustomDataframe as mkdf, assertRaisesRegexp) from pandas import isnull, DataFrame, Index, MultiIndex, Panel, Series, date_range, read_table, read_csv @@ -224,12 +224,12 @@ def test_join_on(self): self.assertTrue(np.isnan(joined['three']['c'])) # merge column not p resent - self.assertRaises(Exception, target.join, source, on='E') + self.assertRaises(KeyError, target.join, source, on='E') # overlap source_copy = source.copy() source_copy['A'] = 0 - self.assertRaises(Exception, target.join, source_copy, on='A') + self.assertRaises(ValueError, target.join, source_copy, on='A') def test_join_on_fails_with_different_right_index(self): with tm.assertRaises(ValueError): @@ -387,7 +387,7 @@ def test_join_multiindex(self): df2 = df2.sortlevel(0) joined = df1.join(df2, how='outer') - ex_index = index1._tuple_index + index2._tuple_index + ex_index = index1._tuple_index.union(index2._tuple_index) expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) expected.index.names = index1.names assert_frame_equal(joined, expected) @@ -397,7 +397,7 @@ def test_join_multiindex(self): df2 = df2.sortlevel(1) joined = df1.join(df2, how='outer').sortlevel(0) - ex_index = index1._tuple_index + index2._tuple_index + ex_index = index1._tuple_index.union(index2._tuple_index) expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) expected.index.names = index1.names @@ -551,15 +551,15 @@ def test_merge_index_singlekey_inner(self): assert_frame_equal(result, expected.ix[:, result.columns]) def test_merge_misspecified(self): - self.assertRaises(Exception, merge, self.left, self.right, + self.assertRaises(ValueError, merge, self.left, self.right, left_index=True) - self.assertRaises(Exception, merge, self.left, self.right, + self.assertRaises(ValueError, merge, self.left, self.right, right_index=True) - self.assertRaises(Exception, merge, self.left, self.left, + self.assertRaises(ValueError, merge, self.left, self.left, left_on='key', on='key') - self.assertRaises(Exception, merge, self.df, self.df2, + self.assertRaises(ValueError, merge, self.df, self.df2, left_on=['key1'], right_on=['key1', 'key2']) def test_merge_overlap(self): @@ -854,7 +854,7 @@ def test_overlapping_columns_error_message(self): df.columns = ['key', 'foo', 'foo'] df2.columns = ['key', 'bar', 'bar'] - 
self.assertRaises(Exception, merge, df, df2) + self.assertRaises(ValueError, merge, df, df2) def _check_merge(x, y): for how in ['inner', 'left', 'outer']: @@ -913,7 +913,7 @@ def test_merge_right_vs_left(self): def test_compress_group_combinations(self): # ~ 40000000 possible unique groups - key1 = np.array([rands(10) for _ in range(10000)], dtype='O') + key1 = tm.rands_array(10, 10000) key1 = np.tile(key1, 2) key2 = key1[::-1] @@ -967,6 +967,98 @@ def test_left_join_index_preserve_order(self): right_on=['k1', 'k2'], how='right') tm.assert_frame_equal(joined.ix[:, expected.columns], expected) + def test_left_join_index_multi_match_multiindex(self): + left = DataFrame([ + ['X', 'Y', 'C', 'a'], + ['W', 'Y', 'C', 'e'], + ['V', 'Q', 'A', 'h'], + ['V', 'R', 'D', 'i'], + ['X', 'Y', 'D', 'b'], + ['X', 'Y', 'A', 'c'], + ['W', 'Q', 'B', 'f'], + ['W', 'R', 'C', 'g'], + ['V', 'Y', 'C', 'j'], + ['X', 'Y', 'B', 'd']], + columns=['cola', 'colb', 'colc', 'tag'], + index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8]) + + right = DataFrame([ + ['W', 'R', 'C', 0], + ['W', 'Q', 'B', 3], + ['W', 'Q', 'B', 8], + ['X', 'Y', 'A', 1], + ['X', 'Y', 'A', 4], + ['X', 'Y', 'B', 5], + ['X', 'Y', 'C', 6], + ['X', 'Y', 'C', 9], + ['X', 'Q', 'C', -6], + ['X', 'R', 'C', -9], + ['V', 'Y', 'C', 7], + ['V', 'R', 'D', 2], + ['V', 'R', 'D', -1], + ['V', 'Q', 'A', -3]], + columns=['col1', 'col2', 'col3', 'val']) + + right.set_index(['col1', 'col2', 'col3'], inplace=True) + result = left.join(right, on=['cola', 'colb', 'colc'], how='left') + + expected = DataFrame([ + ['X', 'Y', 'C', 'a', 6], + ['X', 'Y', 'C', 'a', 9], + ['W', 'Y', 'C', 'e', nan], + ['V', 'Q', 'A', 'h', -3], + ['V', 'R', 'D', 'i', 2], + ['V', 'R', 'D', 'i', -1], + ['X', 'Y', 'D', 'b', nan], + ['X', 'Y', 'A', 'c', 1], + ['X', 'Y', 'A', 'c', 4], + ['W', 'Q', 'B', 'f', 3], + ['W', 'Q', 'B', 'f', 8], + ['W', 'R', 'C', 'g', 0], + ['V', 'Y', 'C', 'j', 7], + ['X', 'Y', 'B', 'd', 5]], + columns=['cola', 'colb', 'colc', 'tag', 'val'], + index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8]) + + tm.assert_frame_equal(result, expected) + + def test_left_join_index_multi_match(self): + left = DataFrame([ + ['c', 0], + ['b', 1], + ['a', 2], + ['b', 3]], + columns=['tag', 'val'], + index=[2, 0, 1, 3]) + + right = DataFrame([ + ['a', 'v'], + ['c', 'w'], + ['c', 'x'], + ['d', 'y'], + ['a', 'z'], + ['c', 'r'], + ['e', 'q'], + ['c', 's']], + columns=['tag', 'char']) + + right.set_index('tag', inplace=True) + result = left.join(right, on='tag', how='left') + + expected = DataFrame([ + ['c', 0, 'w'], + ['c', 0, 'x'], + ['c', 0, 'r'], + ['c', 0, 's'], + ['b', 1, nan], + ['a', 2, 'v'], + ['a', 2, 'z'], + ['b', 3, nan]], + columns=['tag', 'val', 'char'], + index=[2, 2, 2, 2, 0, 1, 1, 3]) + + tm.assert_frame_equal(result, expected) + def test_join_multi_dtypes(self): # test with multi dtypes in the join index @@ -1393,6 +1485,38 @@ def test_append_missing_column_proper_upcast(self): self.assertEqual(appended['A'].dtype, 'f8') self.assertEqual(appended['B'].dtype, 'O') + def test_concat_copy(self): + + df = DataFrame(np.random.randn(4, 3)) + df2 = DataFrame(np.random.randint(0,10,size=4).reshape(4,1)) + df3 = DataFrame({5 : 'foo'},index=range(4)) + + # these are actual copies + result = concat([df,df2,df3],axis=1,copy=True) + for b in result._data.blocks: + self.assertIsNone(b.values.base) + + # these are the same + result = concat([df,df2,df3],axis=1,copy=False) + for b in result._data.blocks: + if b.is_float: + self.assertTrue(b.values.base is df._data.blocks[0].values.base) + elif b.is_integer: + 
self.assertTrue(b.values.base is df2._data.blocks[0].values.base) + elif b.is_object: + self.assertIsNotNone(b.values.base) + + # float block was consolidated + df4 = DataFrame(np.random.randn(4,1)) + result = concat([df,df2,df3,df4],axis=1,copy=False) + for b in result._data.blocks: + if b.is_float: + self.assertIsNone(b.values.base) + elif b.is_integer: + self.assertTrue(b.values.base is df2._data.blocks[0].values.base) + elif b.is_object: + self.assertIsNotNone(b.values.base) + def test_concat_with_group_keys(self): df = DataFrame(np.random.randn(4, 3)) df2 = DataFrame(np.random.randn(4, 4)) @@ -1998,7 +2122,7 @@ def test_concat_exclude_none(self): pieces = [df[:5], None, None, df[5:]] result = concat(pieces) tm.assert_frame_equal(result, df) - self.assertRaises(Exception, concat, [None, None]) + self.assertRaises(ValueError, concat, [None, None]) def test_concat_datetime64_block(self): from pandas.tseries.index import date_range diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index 7e52c8c333dbf..23350b203ee50 100644 --- a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -266,6 +266,34 @@ def _check_output(res, col, index=['A', 'B'], columns=['C']): gmarg = table[item]['All', ''] self.assertEqual(gmarg, self.data[item].mean()) + # issue number #8349: pivot_table with margins and dictionary aggfunc + + df=DataFrame([ {'JOB':'Worker','NAME':'Bob' ,'YEAR':2013,'MONTH':12,'DAYS': 3,'SALARY': 17}, + {'JOB':'Employ','NAME':'Mary','YEAR':2013,'MONTH':12,'DAYS': 5,'SALARY': 23}, + {'JOB':'Worker','NAME':'Bob' ,'YEAR':2014,'MONTH': 1,'DAYS':10,'SALARY':100}, + {'JOB':'Worker','NAME':'Bob' ,'YEAR':2014,'MONTH': 1,'DAYS':11,'SALARY':110}, + {'JOB':'Employ','NAME':'Mary','YEAR':2014,'MONTH': 1,'DAYS':15,'SALARY':200}, + {'JOB':'Worker','NAME':'Bob' ,'YEAR':2014,'MONTH': 2,'DAYS': 8,'SALARY': 80}, + {'JOB':'Employ','NAME':'Mary','YEAR':2014,'MONTH': 2,'DAYS': 5,'SALARY':190} ]) + + df=df.set_index(['JOB','NAME','YEAR','MONTH'],drop=False,append=False) + + rs=df.pivot_table( index=['JOB','NAME'], + columns=['YEAR','MONTH'], + values=['DAYS','SALARY'], + aggfunc={'DAYS':'mean','SALARY':'sum'}, + margins=True) + + ex=df.pivot_table(index=['JOB','NAME'],columns=['YEAR','MONTH'],values=['DAYS'],aggfunc='mean',margins=True) + + tm.assert_frame_equal(rs['DAYS'], ex['DAYS']) + + ex=df.pivot_table(index=['JOB','NAME'],columns=['YEAR','MONTH'],values=['SALARY'],aggfunc='sum',margins=True) + + tm.assert_frame_equal(rs['SALARY'], ex['SALARY']) + + + def test_pivot_integer_columns(self): # caused by upstream bug in unstack @@ -527,6 +555,50 @@ def test_pivot_datetime_tz(self): aggfunc=[np.sum, np.mean]) tm.assert_frame_equal(result, expected) + def test_pivot_dtaccessor(self): + # GH 8103 + dates1 = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00', + '2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00'] + dates2 = ['2013-01-01 15:00:00', '2013-01-01 15:00:00', '2013-01-01 15:00:00', + '2013-02-01 15:00:00', '2013-02-01 15:00:00', '2013-02-01 15:00:00'] + df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'], + 'dt1': dates1, 'dt2': dates2, + 'value1': np.arange(6,dtype='int64'), 'value2': [1, 2] * 3}) + df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d)) + df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d)) + + result = pivot_table(df, index='label', columns=df['dt1'].dt.hour, + values='value1') + + exp_idx = Index(['a', 'b'], name='label') + expected = DataFrame({7: [0, 3], 8: [1, 4], 9:[2, 5]}, + 
index=exp_idx, columns=[7, 8, 9]) + tm.assert_frame_equal(result, expected) + + result = pivot_table(df, index=df['dt2'].dt.month, columns=df['dt1'].dt.hour, + values='value1') + + expected = DataFrame({7: [0, 3], 8: [1, 4], 9:[2, 5]}, + index=[1, 2], columns=[7, 8, 9]) + tm.assert_frame_equal(result, expected) + + result = pivot_table(df, index=df['dt2'].dt.year, + columns=[df['dt1'].dt.hour, df['dt2'].dt.month], + values='value1') + + exp_col = MultiIndex.from_arrays([[7, 7, 8, 8, 9, 9], [1, 2] * 3]) + expected = DataFrame(np.array([[0, 3, 1, 4, 2, 5]],dtype='int64'), + index=[2013], columns=exp_col) + tm.assert_frame_equal(result, expected) + + result = pivot_table(df, index=np.array(['X', 'X', 'X', 'X', 'Y', 'Y']), + columns=[df['dt1'].dt.hour, df['dt2'].dt.month], + values='value1') + expected = DataFrame(np.array([[0, 3, 1, np.nan, 2, np.nan], + [np.nan, np.nan, np.nan, 4, np.nan, 5]]), + index=['X', 'Y'], columns=exp_col) + tm.assert_frame_equal(result, expected) + class TestCrosstab(tm.TestCase): diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py index 7390a4b11095b..3bdd49673ca71 100644 --- a/pandas/tools/tests/test_tile.py +++ b/pandas/tools/tests/test_tile.py @@ -76,12 +76,12 @@ def test_labels(self): result, bins = cut(arr, 4, retbins=True) ex_levels = ['(-0.001, 0.25]', '(0.25, 0.5]', '(0.5, 0.75]', '(0.75, 1]'] - self.assert_numpy_array_equal(result.levels, ex_levels) + self.assert_numpy_array_equal(result.categories, ex_levels) result, bins = cut(arr, 4, retbins=True, right=False) ex_levels = ['[0, 0.25)', '[0.25, 0.5)', '[0.5, 0.75)', '[0.75, 1.001)'] - self.assert_numpy_array_equal(result.levels, ex_levels) + self.assert_numpy_array_equal(result.categories, ex_levels) def test_cut_pass_series_name_to_factor(self): s = Series(np.random.randn(100), name='foo') @@ -95,7 +95,7 @@ def test_label_precision(self): result = cut(arr, 4, precision=2) ex_levels = ['(-0.00072, 0.18]', '(0.18, 0.36]', '(0.36, 0.54]', '(0.54, 0.72]'] - self.assert_numpy_array_equal(result.levels, ex_levels) + self.assert_numpy_array_equal(result.categories, ex_levels) def test_na_handling(self): arr = np.arange(0, 0.75, 0.01) @@ -120,10 +120,10 @@ def test_inf_handling(self): result = cut(data, [-np.inf, 2, 4, np.inf]) result_ser = cut(data_ser, [-np.inf, 2, 4, np.inf]) - ex_levels = ['(-inf, 2]', '(2, 4]', '(4, inf]'] + ex_categories = ['(-inf, 2]', '(2, 4]', '(4, inf]'] - np.testing.assert_array_equal(result.levels, ex_levels) - np.testing.assert_array_equal(result_ser.levels, ex_levels) + np.testing.assert_array_equal(result.categories, ex_categories) + np.testing.assert_array_equal(result_ser.cat.categories, ex_categories) self.assertEqual(result[5], '(4, inf]') self.assertEqual(result[0], '(-inf, 2]') self.assertEqual(result_ser[5], '(4, inf]') @@ -172,7 +172,7 @@ def test_cut_pass_labels(self): result = cut(arr, bins, labels=labels) exp = cut(arr, bins) - exp.levels = labels + exp.categories = labels self.assertTrue(result.equals(exp)) @@ -182,7 +182,7 @@ def test_qcut_include_lowest(self): cats = qcut(values, 4) ex_levels = ['[0, 2.25]', '(2.25, 4.5]', '(4.5, 6.75]', '(6.75, 9]'] - self.assertTrue((cats.levels == ex_levels).all()) + self.assertTrue((cats.categories == ex_levels).all()) def test_qcut_nas(self): arr = np.random.randn(100) @@ -216,7 +216,7 @@ def test_qcut_binning_issues(self): starts = [] ends = [] - for lev in result.levels: + for lev in result.categories: s, e = lev[1:-1].split(',') self.assertTrue(s != e) @@ -230,6 +230,25 @@ def 
test_qcut_binning_issues(self): self.assertTrue(ep < en) self.assertTrue(ep <= sn) + def test_cut_return_categorical(self): + from pandas import Categorical + s = Series([0,1,2,3,4,5,6,7,8]) + res = cut(s,3) + exp = Series(Categorical.from_codes([0,0,0,1,1,1,2,2,2], + ["(-0.008, 2.667]", "(2.667, 5.333]", "(5.333, 8]"], + ordered=True)) + tm.assert_series_equal(res, exp) + + def test_qcut_return_categorical(self): + from pandas import Categorical + s = Series([0,1,2,3,4,5,6,7,8]) + res = qcut(s,[0,0.333,0.666,1]) + exp = Series(Categorical.from_codes([0,0,0,1,1,1,2,2,2], + ["[0, 2.664]", "(2.664, 5.328]", "(5.328, 8]"], + ordered=True)) + tm.assert_series_equal(res, exp) + + def curpath(): pth, _ = os.path.split(os.path.abspath(__file__)) diff --git a/pandas/tools/tests/test_tools.py b/pandas/tools/tests/test_tools.py deleted file mode 100644 index 4fd70e28497a8..0000000000000 --- a/pandas/tools/tests/test_tools.py +++ /dev/null @@ -1,23 +0,0 @@ -from pandas import DataFrame -from pandas.tools.describe import value_range - -import numpy as np -import pandas.util.testing as tm - - -class TestTools(tm.TestCase): - - def test_value_range(self): - df = DataFrame(np.random.randn(5, 5)) - df.ix[0, 2] = -5 - df.ix[2, 0] = 5 - - res = value_range(df) - - self.assertEqual(res['Minimum'], -5) - self.assertEqual(res['Maximum'], 5) - - df.ix[0, 1] = np.NaN - - self.assertEqual(res['Minimum'], -5) - self.assertEqual(res['Maximum'], 5) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index b28f7c89606de..5eddd2f8dec33 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -34,7 +34,8 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, right == True (the default), then the bins [1,2,3,4] indicate (1,2], (2,3], (3,4]. labels : array or boolean, default None - Labels to use for bin edges, or False to return integer bin labels + Used as labels for the resulting bins. Must be of the same length as the resulting + bins. If False, return only integer indicators of the bins. retbins : bool, optional Whether to return the bins or not. Can be useful if bins is given as a scalar. @@ -45,7 +46,10 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, Returns ------- - out : Categorical or array of integers if labels is False + out : Categorical or Series or array of integers if labels is False + The return type (Categorical or Series) depends on the input: a Series of type category if + input is a Series else Categorical. Bins are represented as categories when categorical + data is returned. bins : ndarray of floats Returned only if `retbins` is True. 
@@ -61,12 +65,15 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, Examples -------- - >>> cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, retbins=True) - (array([(0.191, 3.367], (0.191, 3.367], (0.191, 3.367], (3.367, 6.533], - (6.533, 9.7], (0.191, 3.367]], dtype=object), - array([ 0.1905 , 3.36666667, 6.53333333, 9.7 ])) - >>> cut(np.ones(5), 4, labels=False) - array([2, 2, 2, 2, 2]) + >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, retbins=True) + ([(0.191, 3.367], (0.191, 3.367], (0.191, 3.367], (3.367, 6.533], (6.533, 9.7], (0.191, 3.367]] + Categories (3, object): [(0.191, 3.367] < (3.367, 6.533] < (6.533, 9.7]], + array([ 0.1905 , 3.36666667, 6.53333333, 9.7 ])) + >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, labels=["good","medium","bad"]) + [good, good, good, medium, bad, good] + Categories (3, object): [good < medium < bad] + >>> pd.cut(np.ones(5), 4, labels=False) + array([1, 1, 1, 1, 1], dtype=int64) """ # NOTE: this binning code is changed a bit from histogram for var(x) == 0 if not np.iterable(bins): @@ -102,9 +109,12 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, if (np.diff(bins) < 0).any(): raise ValueError('bins must increase monotonically.') - return _bins_to_cuts(x, bins, right=right, labels=labels, - retbins=retbins, precision=precision, - include_lowest=include_lowest) + res = _bins_to_cuts(x, bins, right=right, labels=labels,retbins=retbins, precision=precision, + include_lowest=include_lowest) + if isinstance(x, Series): + res = Series(res, index=x.index) + return res + def qcut(x, q, labels=None, retbins=False, precision=3): @@ -121,7 +131,8 @@ def qcut(x, q, labels=None, retbins=False, precision=3): Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles labels : array or boolean, default None - Labels to use for bin edges, or False to return integer bin labels + Used as labels for the resulting bins. Must be of the same length as the resulting + bins. If False, return only integer indicators of the bins. retbins : bool, optional Whether to return the bins or not. Can be useful if bins is given as a scalar. @@ -130,7 +141,12 @@ def qcut(x, q, labels=None, retbins=False, precision=3): Returns ------- - cat : Categorical + out : Categorical or Series or array of integers if labels is False + The return type (Categorical or Series) depends on the input: a Series of type category if + input is a Series else Categorical. Bins are represented as categories when categorical + data is returned. + bins : ndarray of floats + Returned only if `retbins` is True. 
Notes ----- @@ -138,14 +154,26 @@ def qcut(x, q, labels=None, retbins=False, precision=3): Examples -------- + >>> pd.qcut(range(5), 4) + [[0, 1], [0, 1], (1, 2], (2, 3], (3, 4]] + Categories (4, object): [[0, 1] < (1, 2] < (2, 3] < (3, 4]] + >>> pd.qcut(range(5), 3, labels=["good","medium","bad"]) + [good, good, medium, bad, bad] + Categories (3, object): [good < medium < bad] + >>> pd.qcut(range(5), 4, labels=False) + array([0, 0, 1, 2, 3], dtype=int64) """ if com.is_integer(q): quantiles = np.linspace(0, 1, q + 1) else: quantiles = q bins = algos.quantile(x, quantiles) - return _bins_to_cuts(x, bins, labels=labels, retbins=retbins, - precision=precision, include_lowest=True) + res = _bins_to_cuts(x, bins, labels=labels, retbins=retbins,precision=precision, + include_lowest=True) + if isinstance(x, Series): + res = Series(res, index=x.index) + return res + def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, @@ -189,7 +217,7 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, levels = np.asarray(levels, dtype=object) np.putmask(ids, na_mask, 0) - fac = Categorical(ids - 1, levels, name=name, fastpath=True) + fac = Categorical(ids - 1, levels, ordered=True, name=name, fastpath=True) else: fac = ids - 1 if has_nas: diff --git a/pandas/tools/util.py b/pandas/tools/util.py index 215a76b84452a..72fdeaff36ef1 100644 --- a/pandas/tools/util.py +++ b/pandas/tools/util.py @@ -1,4 +1,5 @@ import operator +import warnings from pandas.compat import reduce from pandas.core.index import Index import numpy as np @@ -47,3 +48,22 @@ def compose(*funcs): """Compose 2 or more callables""" assert len(funcs) > 1, 'At least 2 callables must be passed to compose' return reduce(_compose2, funcs) + +### FIXME: remove in 0.16 +def value_range(df): + """ + Return the minimum and maximum of a dataframe in a series object + + Parameters + ---------- + df : DataFrame + + Returns + ------- + (maximum, minimum) : Series + + """ + from pandas import Series + warnings.warn("value_range is deprecated. Use .describe() instead", FutureWarning) + + return Series((min(df.min()), max(df.max())), ('Minimum', 'Maximum')) diff --git a/pandas/tseries/api.py b/pandas/tseries/api.py index c2cc3723802fc..7c47bd9a232a9 100644 --- a/pandas/tseries/api.py +++ b/pandas/tseries/api.py @@ -5,6 +5,7 @@ from pandas.tseries.index import DatetimeIndex, date_range, bdate_range from pandas.tseries.frequencies import infer_freq +from pandas.tseries.tdi import Timedelta, TimedeltaIndex, timedelta_range from pandas.tseries.period import Period, PeriodIndex, period_range, pnow from pandas.tseries.resample import TimeGrouper from pandas.tseries.timedeltas import to_timedelta diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py new file mode 100644 index 0000000000000..78ed19a7049a5 --- /dev/null +++ b/pandas/tseries/base.py @@ -0,0 +1,473 @@ +""" +Base and utility classes for tseries type pandas objects. 
+""" + + +from datetime import datetime, time, timedelta + +from pandas import compat +import numpy as np +from pandas.core import common as com +import pandas.tslib as tslib +import pandas.lib as lib +from pandas.core.index import Index +from pandas.util.decorators import Appender, cache_readonly +from pandas.tseries.frequencies import ( + infer_freq, to_offset, get_period_alias, + Resolution) +import pandas.algos as _algos + +class DatetimeIndexOpsMixin(object): + """ common ops mixin to support a unified inteface datetimelike Index """ + + def __iter__(self): + return (self._box_func(v) for v in self.asi8) + + @staticmethod + def _join_i8_wrapper(joinf, dtype, with_indexers=True): + """ create the join wrapper methods """ + + @staticmethod + def wrapper(left, right): + if isinstance(left, (np.ndarray, com.ABCIndex, com.ABCSeries)): + left = left.view('i8') + if isinstance(right, (np.ndarray, com.ABCIndex, com.ABCSeries)): + right = right.view('i8') + results = joinf(left, right) + if with_indexers: + join_index, left_indexer, right_indexer = results + join_index = join_index.view(dtype) + return join_index, left_indexer, right_indexer + return results + + return wrapper + + @property + def _box_func(self): + """ + box function to get object from internal representation + """ + raise NotImplementedError + + def _box_values(self, values): + """ + apply box func to passed values + """ + return lib.map_infer(values, self._box_func) + + def groupby(self, f): + objs = self.asobject.values + return _algos.groupby_object(objs, f) + + def _format_with_header(self, header, **kwargs): + return header + self._format_native_types(**kwargs) + + def __contains__(self, key): + try: + res = self.get_loc(key) + return np.isscalar(res) or type(res) == slice + except (KeyError, TypeError): + return False + + @cache_readonly + def inferred_freq(self): + try: + return infer_freq(self) + except ValueError: + return None + + # Try to run function on index first, and then on elements of index + # Especially important for group-by functionality + def map(self, f): + try: + result = f(self) + if not isinstance(result, (np.ndarray, Index)): + raise TypeError + return result + except Exception: + return _algos.arrmap_object(self.asobject.values, f) + + def order(self, return_indexer=False, ascending=True): + """ + Return sorted copy of Index + """ + if return_indexer: + _as = self.argsort() + if not ascending: + _as = _as[::-1] + sorted_index = self.take(_as) + return sorted_index, _as + else: + sorted_values = np.sort(self.values) + if not ascending: + sorted_values = sorted_values[::-1] + attribs = self._get_attributes_dict() + attribs['freq'] = None + return self._simple_new(sorted_values, **attribs) + + def take(self, indices, axis=0): + """ + Analogous to ndarray.take + """ + maybe_slice = lib.maybe_indices_to_slice(com._ensure_int64(indices)) + if isinstance(maybe_slice, slice): + return self[maybe_slice] + return super(DatetimeIndexOpsMixin, self).take(indices, axis) + + def slice_locs(self, start=None, end=None): + """ + Index.slice_locs, customized to handle partial ISO-8601 string slicing + """ + if isinstance(start, compat.string_types) or isinstance(end, compat.string_types): + + if self.is_monotonic: + try: + if start: + start_loc = self._get_string_slice(start).start + else: + start_loc = 0 + + if end: + end_loc = self._get_string_slice(end).stop + else: + end_loc = len(self) + + return start_loc, end_loc + except KeyError: + pass + + else: + # can't use a slice indexer because we are not sorted! 
+ # so create an indexer directly + try: + if start: + start_loc = self._get_string_slice(start, + use_rhs=False) + else: + start_loc = np.arange(len(self)) + + if end: + end_loc = self._get_string_slice(end, use_lhs=False) + else: + end_loc = np.arange(len(self)) + + return start_loc, end_loc + except KeyError: + pass + + if isinstance(start, time) or isinstance(end, time): + raise KeyError('Cannot use slice_locs with time slice keys') + + return Index.slice_locs(self, start, end) + + def get_duplicates(self): + values = Index.get_duplicates(self) + return self._simple_new(values) + + @cache_readonly + def hasnans(self): + """ return if I have any nans; enables various perf speedups """ + return (self.asi8 == tslib.iNaT).any() + + @property + def asobject(self): + from pandas.core.index import Index + return Index(self._box_values(self.asi8), name=self.name, dtype=object) + + def tolist(self): + """ + return a list of the underlying data + """ + return list(self.asobject) + + def min(self, axis=None): + """ + return the minimum value of the Index + + See also + -------- + numpy.ndarray.min + """ + try: + i8 = self.asi8 + + # quick check + if len(i8) and self.is_monotonic: + if i8[0] != tslib.iNaT: + return self._box_func(i8[0]) + + if self.hasnans: + mask = i8 == tslib.iNaT + min_stamp = self[~mask].asi8.min() + else: + min_stamp = i8.min() + return self._box_func(min_stamp) + except ValueError: + return self._na_value + + def argmin(self, axis=None): + """ + return a ndarray of the minimum argument indexer + + See also + -------- + numpy.ndarray.argmin + """ + + i8 = self.asi8 + if self.hasnans: + mask = i8 == tslib.iNaT + if mask.all(): + return -1 + i8 = i8.copy() + i8[mask] = np.iinfo('int64').max + return i8.argmin() + + def max(self, axis=None): + """ + return the maximum value of the Index + + See also + -------- + numpy.ndarray.max + """ + try: + i8 = self.asi8 + + # quick check + if len(i8) and self.is_monotonic: + if i8[-1] != tslib.iNaT: + return self._box_func(i8[-1]) + + if self.hasnans: + mask = i8 == tslib.iNaT + max_stamp = self[~mask].asi8.max() + else: + max_stamp = i8.max() + return self._box_func(max_stamp) + except ValueError: + return self._na_value + + def argmax(self, axis=None): + """ + return a ndarray of the maximum argument indexer + + See also + -------- + numpy.ndarray.argmax + """ + + i8 = self.asi8 + if self.hasnans: + mask = i8 == tslib.iNaT + if mask.all(): + return -1 + i8 = i8.copy() + i8[mask] = 0 + return i8.argmax() + + @property + def _formatter_func(self): + """ + Format function to convert value to representation + """ + return str + + def _format_footer(self): + raise NotImplementedError + + def __unicode__(self): + formatter = self._formatter_func + summary = str(self.__class__) + '\n' + + n = len(self) + if n == 0: + pass + elif n == 1: + first = formatter(self[0]) + summary += '[%s]\n' % first + elif n == 2: + first = formatter(self[0]) + last = formatter(self[-1]) + summary += '[%s, %s]\n' % (first, last) + else: + first = formatter(self[0]) + last = formatter(self[-1]) + summary += '[%s, ..., %s]\n' % (first, last) + + summary += self._format_footer() + return summary + + @cache_readonly + def _resolution(self): + from pandas.tseries.frequencies import Resolution + return Resolution.get_reso_from_freq(self.freqstr) + + @cache_readonly + def resolution(self): + """ + Returns day, hour, minute, second, millisecond or microsecond + """ + from pandas.tseries.frequencies import get_reso_string + return get_reso_string(self._resolution) + + def 
_add_datelike(self, other): + raise NotImplementedError + + def _sub_datelike(self, other): + raise NotImplementedError + + def __add__(self, other): + from pandas.core.index import Index + from pandas.tseries.tdi import TimedeltaIndex + from pandas.tseries.offsets import DateOffset + if isinstance(other, TimedeltaIndex): + return self._add_delta(other) + elif isinstance(self, TimedeltaIndex) and isinstance(other, Index): + if hasattr(other,'_add_delta'): + return other._add_delta(self) + raise TypeError("cannot add TimedeltaIndex and {typ}".format(typ=type(other))) + elif isinstance(other, Index): + return self.union(other) + elif isinstance(other, (DateOffset, timedelta, np.timedelta64, tslib.Timedelta)): + return self._add_delta(other) + elif com.is_integer(other): + return self.shift(other) + elif isinstance(other, (tslib.Timestamp, datetime)): + return self._add_datelike(other) + else: # pragma: no cover + return NotImplemented + + def __sub__(self, other): + from pandas.core.index import Index + from pandas.tseries.tdi import TimedeltaIndex + from pandas.tseries.offsets import DateOffset + if isinstance(other, TimedeltaIndex): + return self._add_delta(-other) + elif isinstance(self, TimedeltaIndex) and isinstance(other, Index): + if not isinstance(other, TimedeltaIndex): + raise TypeError("cannot subtract TimedeltaIndex and {typ}".format(typ=type(other))) + return self._add_delta(-other) + elif isinstance(other, Index): + return self.difference(other) + elif isinstance(other, (DateOffset, timedelta, np.timedelta64, tslib.Timedelta)): + return self._add_delta(-other) + elif com.is_integer(other): + return self.shift(-other) + elif isinstance(other, (tslib.Timestamp, datetime)): + return self._sub_datelike(other) + else: # pragma: no cover + return NotImplemented + + __iadd__ = __add__ + __isub__ = __sub__ + + def _add_delta(self, other): + return NotImplemented + + def _add_delta_td(self, other): + # add a delta of a timedeltalike + # return the i8 result view + + inc = tslib._delta_to_nanoseconds(other) + mask = self.asi8 == tslib.iNaT + new_values = (self.asi8 + inc).view(self.dtype) + new_values[mask] = tslib.iNaT + return new_values.view(self.dtype) + + def _add_delta_tdi(self, other): + # add a delta of a TimedeltaIndex + # return the i8 result view + + # delta operation + if not len(self) == len(other): + raise ValueError("cannot add indices of unequal length") + + self_i8 = self.asi8 + other_i8 = other.asi8 + mask = (self_i8 == tslib.iNaT) | (other_i8 == tslib.iNaT) + new_values = self_i8 + other_i8 + new_values[mask] = tslib.iNaT + return new_values.view(self.dtype) + + def isin(self, values): + """ + Compute boolean array of whether each index value is found in the + passed set of values + + Parameters + ---------- + values : set or sequence of values + + Returns + ------- + is_contained : ndarray (boolean dtype) + """ + if not isinstance(values, type(self)): + try: + values = type(self)(values) + except ValueError: + return self.asobject.isin(values) + + value_set = set(values.asi8) + return lib.ismember(self.asi8, value_set) + + def shift(self, n, freq=None): + """ + Specialized shift which produces a DatetimeIndex + + Parameters + ---------- + n : int + Periods to shift by + freq : DateOffset or timedelta-like, optional + + Returns + ------- + shifted : DatetimeIndex + """ + if freq is not None and freq != self.freq: + if isinstance(freq, compat.string_types): + freq = to_offset(freq) + result = Index.shift(self, n, freq) + + if hasattr(self,'tz'): + result.tz = self.tz + 
+ return result + + if n == 0: + # immutable so OK + return self + + if self.freq is None: + raise ValueError("Cannot shift with no freq") + + start = self[0] + n * self.freq + end = self[-1] + n * self.freq + attribs = self._get_attributes_dict() + attribs['start'] = start + attribs['end'] = end + return type(self)(**attribs) + + def unique(self): + """ + Index.unique with handling for DatetimeIndex/PeriodIndex metadata + + Returns + ------- + result : DatetimeIndex or PeriodIndex + """ + from pandas.core.index import Int64Index + result = Int64Index.unique(self) + return self._simple_new(result, name=self.name, freq=self.freq, + tz=getattr(self, 'tz', None)) + + def repeat(self, repeats, axis=None): + """ + Analogous to ndarray.repeat + """ + return self._simple_new(self.values.repeat(repeats), + name=self.name) + + diff --git a/pandas/tseries/common.py b/pandas/tseries/common.py index 92ccd1248fac9..227af42f07411 100644 --- a/pandas/tseries/common.py +++ b/pandas/tseries/common.py @@ -3,7 +3,7 @@ import numpy as np from pandas.core.base import PandasDelegate from pandas.core import common as com -from pandas import Series, DatetimeIndex, PeriodIndex +from pandas import Series, DatetimeIndex, PeriodIndex, TimedeltaIndex from pandas import lib, tslib def is_datetimelike(data): @@ -17,7 +17,8 @@ def is_datetimelike(data): def maybe_to_datetimelike(data, copy=False): """ - return a DelegatedClass of a Series that is datetimelike (e.g. datetime64[ns] dtype or a Series of Periods) + return a DelegatedClass of a Series that is datetimelike + (e.g. datetime64[ns],timedelta64[ns] dtype or a Series of Periods) raise TypeError if this is not possible. Parameters @@ -37,16 +38,14 @@ def maybe_to_datetimelike(data, copy=False): index = data.index if issubclass(data.dtype.type, np.datetime64): - return DatetimeProperties(DatetimeIndex(data, copy=copy), index) + return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer'), index) + elif issubclass(data.dtype.type, np.timedelta64): + return TimedeltaProperties(TimedeltaIndex(data, copy=copy, freq='infer'), index) else: - - if isinstance(data, PeriodIndex): + if com.is_period_arraylike(data): return PeriodProperties(PeriodIndex(data, copy=copy), index) - - data = com._values_from_object(data) - inferred = lib.infer_dtype(data) - if inferred == 'period': - return PeriodProperties(PeriodIndex(data), index) + if com.is_datetime_arraylike(data): + return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer'), index) raise TypeError("cannot convert an object of type {0} to a datetimelike index".format(type(data))) @@ -63,6 +62,8 @@ def _delegate_property_get(self, name): if isinstance(result, np.ndarray): if com.is_integer_dtype(result): result = result.astype('int64') + elif not com.is_list_like(result): + return result # return the result as a Series, which is by definition a copy result = Series(result, index=self.index) @@ -77,6 +78,21 @@ def _delegate_property_set(self, name, value, *args, **kwargs): raise ValueError("modifications to a property of a datetimelike object are not " "supported. Change values on the original.") + def _delegate_method(self, name, *args, **kwargs): + method = getattr(self.values, name) + result = method(*args, **kwargs) + + if not com.is_list_like(result): + return result + + result = Series(result, index=self.index) + + # setting this object will show a SettingWithCopyWarning/Error + result.is_copy = ("modifications to a method of a datetimelike object are not " + "supported and are discarded. 
Change values on the original.") + + return result + class DatetimeProperties(Properties): """ @@ -92,9 +108,42 @@ class DatetimeProperties(Properties): Raises TypeError if the Series does not contain datetimelike values. """ + def to_pydatetime(self): + return self.values.to_pydatetime() + DatetimeProperties._add_delegate_accessors(delegate=DatetimeIndex, accessors=DatetimeIndex._datetimelike_ops, typ='property') +DatetimeProperties._add_delegate_accessors(delegate=DatetimeIndex, + accessors=["to_period","tz_localize","tz_convert"], + typ='method') + +class TimedeltaProperties(Properties): + """ + Accessor object for datetimelike properties of the Series values. + + Examples + -------- + >>> s.dt.hours + >>> s.dt.seconds + + Returns a Series indexed like the original Series. + Raises TypeError if the Series does not contain datetimelike values. + """ + + def to_pytimedelta(self): + return self.values.to_pytimedelta() + + @property + def components(self): + return self.values.components + +TimedeltaProperties._add_delegate_accessors(delegate=TimedeltaIndex, + accessors=TimedeltaIndex._datetimelike_ops, + typ='property') +TimedeltaProperties._add_delegate_accessors(delegate=TimedeltaIndex, + accessors=["to_pytimedelta"], + typ='method') class PeriodProperties(Properties): """ diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index edc7b075da6f8..7cd286129e936 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -661,12 +661,17 @@ def infer_freq(index, warn=True): if isinstance(index, com.ABCSeries): values = index.values - if not (com.is_datetime64_dtype(index.values) or values.dtype == object): + if not (com.is_datetime64_dtype(index.values) or com.is_timedelta64_dtype(index.values) or values.dtype == object): raise TypeError("cannot infer freq from a non-convertible dtype on a Series of {0}".format(index.dtype)) index = values - if isinstance(index, pd.PeriodIndex): + + if com.is_period_arraylike(index): raise TypeError("PeriodIndex given. 
Check the `freq` attribute " - "instead of using infer_freq.") + "instead of using infer_freq.") + elif isinstance(index, pd.TimedeltaIndex): + inferer = _TimedeltaFrequencyInferer(index, warn=warn) + return inferer.get_freq() + if isinstance(index, pd.Index) and not isinstance(index, pd.DatetimeIndex): if isinstance(index, (pd.Int64Index, pd.Float64Index)): raise TypeError("cannot infer freq from a non-convertible index type {0}".format(type(index))) @@ -693,8 +698,9 @@ def __init__(self, index, warn=True): self.index = index self.values = np.asarray(index).view('i8') - if index.tz is not None: - self.values = tslib.tz_convert(self.values, 'UTC', index.tz) + if hasattr(index,'tz'): + if index.tz is not None: + self.values = tslib.tz_convert(self.values, 'UTC', index.tz) self.warn = warn @@ -891,6 +897,18 @@ def _get_wom_rule(self): import pandas.core.algorithms as algos +class _TimedeltaFrequencyInferer(_FrequencyInferer): + + def _infer_daily_rule(self): + if self.is_unique: + days = self.deltas[0] / _ONE_DAY + if days % 7 == 0: + # Weekly + alias = _weekday_rule_aliases[self.rep_stamp.weekday()] + return _maybe_add_count('W-%s' % alias, days / 7) + else: + return _maybe_add_count('D', days) + def _maybe_add_count(base, count): if count > 1: diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index f42ad174b8f0f..3b3542b760d6f 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -4,6 +4,7 @@ from dateutil.relativedelta import MO, TU, WE, TH, FR, SA, SU from pandas.tseries.offsets import Easter, Day + def next_monday(dt): """ If holiday falls on Saturday, use following Monday instead; @@ -116,7 +117,8 @@ class Holiday(object): for observance. """ def __init__(self, name, year=None, month=None, day=None, offset=None, - observance=None, start_date=None, end_date=None): + observance=None, start_date=None, end_date=None, + days_of_week=None): """ Parameters ---------- @@ -127,15 +129,35 @@ class from pandas.tseries.offsets computes offset from date observance: function computes when holiday is given a pandas Timestamp + days_of_week: + provide a tuple of days e.g (0,1,2,3,) for Monday Through Thursday + Monday=0,..,Sunday=6 + + Examples + -------- + >>> from pandas.tseries.holiday import Holiday, nearest_workday + >>> from pandas import DateOffset + >>> from dateutil.relativedelta import MO + >>> USMemorialDay = Holiday('MemorialDay', month=5, day=24, + offset=DateOffset(weekday=MO(1))) + >>> USLaborDay = Holiday('Labor Day', month=9, day=1, + offset=DateOffset(weekday=MO(1))) + >>> July3rd = Holiday('July 3rd', month=7, day=3,) + >>> NewYears = Holiday('New Years Day', month=1, day=1, + observance=nearest_workday), + >>> July3rd = Holiday('July 3rd', month=7, day=3, + days_of_week=(0, 1, 2, 3)) """ - self.name = name - self.year = year - self.month = month - self.day = day - self.offset = offset + self.name = name + self.year = year + self.month = month + self.day = day + self.offset = offset self.start_date = start_date - self.end_date = end_date + self.end_date = end_date self.observance = observance + assert (days_of_week is None or type(days_of_week) == tuple) + self.days_of_week = days_of_week def __repr__(self): info = '' @@ -178,16 +200,20 @@ def dates(self, start_date, end_date, return_name=False): end_date = self.end_date start_date = Timestamp(start_date) - end_date = Timestamp(end_date) + end_date = Timestamp(end_date) year_offset = DateOffset(years=1) base_date = Timestamp(datetime(start_date.year, self.month, self.day)) dates = 
DatetimeIndex(start=base_date, end=end_date, freq=year_offset) - holiday_dates = list(self._apply_rule(dates)) - + holiday_dates = self._apply_rule(dates) + if self.days_of_week is not None: + holiday_dates = list(filter(lambda x: x is not None and + x.dayofweek in self.days_of_week, + holiday_dates)) + else: + holiday_dates = list(filter(lambda x: x is not None, holiday_dates)) if return_name: return Series(self.name, index=holiday_dates) - return holiday_dates def _apply_rule(self, dates): @@ -207,14 +233,13 @@ def _apply_rule(self, dates): if self.observance is not None: return map(lambda d: self.observance(d), dates) - if not isinstance(self.offset, list): - offsets = [self.offset] - else: - offsets = self.offset - - for offset in offsets: - dates = list(map(lambda d: d + offset, dates)) - + if self.offset is not None: + if not isinstance(self.offset, list): + offsets = [self.offset] + else: + offsets = self.offset + for offset in offsets: + dates = list(map(lambda d: d + offset, dates)) return dates holiday_calendars = {} @@ -250,7 +275,7 @@ class AbstractHolidayCalendar(object): __metaclass__ = HolidayCalendarMetaClass rules = [] start_date = Timestamp(datetime(1970, 1, 1)) - end_date = Timestamp(datetime(2030, 12, 31)) + end_date = Timestamp(datetime(2030, 12, 31)) _holiday_cache = None def __init__(self, name=None, rules=None): @@ -290,7 +315,7 @@ def holidays(self, start=None, end=None, return_name=False): DatetimeIndex of holidays """ if self.rules is None: - raise Exception('Holiday Calendar %s does not have any '\ + raise Exception('Holiday Calendar %s does not have any ' 'rules specified' % self.name) if start is None: @@ -300,7 +325,7 @@ def holidays(self, start=None, end=None, return_name=False): end = AbstractHolidayCalendar.end_date start = Timestamp(start) - end = Timestamp(end) + end = Timestamp(end) holidays = None # If we don't have a cache or the dates are outside the prior cache, we get them again @@ -334,7 +359,7 @@ def _cache(self, values): @staticmethod def merge_class(base, other): """ - Merge holiday calendars together. The base calendar + Merge holiday calendars together. The base calendar will take precedence to other. The merge will be done based on each holiday's name. @@ -359,7 +384,7 @@ def merge_class(base, other): if not isinstance(base, list): base = [base] - base_holidays = dict([ (holiday.name,holiday) for holiday in base ]) + base_holidays = dict([(holiday.name, holiday) for holiday in base]) other_holidays.update(base_holidays) return list(other_holidays.values()) @@ -376,30 +401,29 @@ def merge(self, other, inplace=False): inplace : bool (default=False) If True set rule_table to holidays, else return array of Holidays """ - holidays = self.merge_class(self, other) + holidays = self.merge_class(self, other) if inplace: self.rules = holidays else: return holidays -USMemorialDay = Holiday('MemorialDay', month=5, day=24, - offset=DateOffset(weekday=MO(1))) -USLaborDay = Holiday('Labor Day', month=9, day=1, - offset=DateOffset(weekday=MO(1))) -USColumbusDay = Holiday('Columbus Day', month=10, day=1, - offset=DateOffset(weekday=MO(2))) +USMemorialDay = Holiday('MemorialDay', month=5, day=24, + offset=DateOffset(weekday=MO(1))) +USLaborDay = Holiday('Labor Day', month=9, day=1, + offset=DateOffset(weekday=MO(1))) +USColumbusDay = Holiday('Columbus Day', month=10, day=1, + offset=DateOffset(weekday=MO(2))) USThanksgivingDay = Holiday('Thanksgiving', month=11, day=1, offset=DateOffset(weekday=TH(4))) USMartinLutherKingJr = Holiday('Dr. 
Martin Luther King Jr.', month=1, day=1, offset=DateOffset(weekday=MO(3))) -USPresidentsDay = Holiday('President''s Day', month=2, day=1, - offset=DateOffset(weekday=MO(3))) +USPresidentsDay = Holiday('President''s Day', month=2, day=1, + offset=DateOffset(weekday=MO(3))) GoodFriday = Holiday("Good Friday", month=1, day=1, offset=[Easter(), Day(-2)]) EasterMonday = Holiday("Easter Monday", month=1, day=1, offset=[Easter(), Day(1)]) - class USFederalHolidayCalendar(AbstractHolidayCalendar): """ US Federal Government Holiday Calendar based on rules specified diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 2acdcfffb7d9a..7aaec511b82bf 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -6,6 +6,8 @@ import numpy as np +import warnings + from pandas.core.common import (_NS_DTYPE, _INT64_DTYPE, _values_from_object, _maybe_box, ABCSeries) @@ -13,12 +15,12 @@ import pandas.compat as compat from pandas.compat import u from pandas.tseries.frequencies import ( - infer_freq, to_offset, get_period_alias, + to_offset, get_period_alias, Resolution) -from pandas.core.base import DatetimeIndexOpsMixin +from pandas.tseries.base import DatetimeIndexOpsMixin from pandas.tseries.offsets import DateOffset, generate_range, Tick, CDay from pandas.tseries.tools import parse_time_string, normalize_date -from pandas.util.decorators import cache_readonly +from pandas.util.decorators import cache_readonly, deprecate_kwarg import pandas.core.common as com import pandas.tseries.offsets as offsets import pandas.tseries.tools as tools @@ -56,22 +58,6 @@ def f(self): return property(f) -def _join_i8_wrapper(joinf, with_indexers=True): - @staticmethod - def wrapper(left, right): - if isinstance(left, (np.ndarray, Index, ABCSeries)): - left = left.view('i8') - if isinstance(right, (np.ndarray, Index, ABCSeries)): - right = right.view('i8') - results = joinf(left, right) - if with_indexers: - join_index, left_indexer, right_indexer = results - join_index = join_index.view('M8[ns]') - return join_index, left_indexer, right_indexer - return results - return wrapper - - def _dt_index_cmp(opname, nat_result=False): """ Wrap comparison operations to convert datetime-like to datetime64 @@ -119,6 +105,17 @@ def _ensure_datetime64(other): _midnight = time(0, 0) +def _new_DatetimeIndex(cls, d): + """ This is called upon unpickling, rather than the default which doesn't have arguments + and breaks __new__ """ + + # simply set the tz + # data are already in UTC + tz = d.pop('tz',None) + result = cls.__new__(cls, **d) + result.tz = tz + return result + class DatetimeIndex(DatetimeIndexOpsMixin, Int64Index): """ Immutable ndarray of datetime64 data, represented internally as int64, and @@ -145,12 +142,25 @@ class DatetimeIndex(DatetimeIndexOpsMixin, Int64Index): closed : string or None, default None Make the interval closed with respect to the given frequency to the 'left', 'right', or both sides (None) + tz : pytz.timezone or dateutil.tz.tzfile + ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' + - 'infer' will attempt to infer fall dst-transition hours based on order + - bool-ndarray where True signifies a DST time, False signifies + a non-DST time (note that this flag is only applicable for ambiguous times) + - 'NaT' will return NaT where there are ambiguous times + - 'raise' will raise an AmbiguousTimeError if there are ambiguous times + infer_dst : boolean, default False (DEPRECATED) + Attempt to infer fall dst-transition hours based on order name : object Name to be stored in the index 
""" _typ = 'datetimeindex' _join_precedence = 10 + + def _join_i8_wrapper(joinf, **kwargs): + return DatetimeIndexOpsMixin._join_i8_wrapper(joinf, dtype='M8[ns]', **kwargs) + _inner_indexer = _join_i8_wrapper(_algos.inner_join_indexer_int64) _outer_indexer = _join_i8_wrapper(_algos.outer_join_indexer_int64) _left_indexer = _join_i8_wrapper(_algos.left_join_indexer_int64) @@ -165,9 +175,6 @@ class DatetimeIndex(DatetimeIndexOpsMixin, Int64Index): __le__ = _dt_index_cmp('__le__') __ge__ = _dt_index_cmp('__ge__') - # structured array cache for datetime fields - _sarr_cache = None - _engine_type = _index.DatetimeEngine tz = None @@ -177,18 +184,20 @@ class DatetimeIndex(DatetimeIndexOpsMixin, Int64Index): _datetimelike_ops = ['year','month','day','hour','minute','second', 'weekofyear','week','dayofweek','weekday','dayofyear','quarter', 'date','time','microsecond','nanosecond','is_month_start','is_month_end', - 'is_quarter_start','is_quarter_end','is_year_start','is_year_end'] + 'is_quarter_start','is_quarter_end','is_year_start','is_year_end','tz','freq'] _is_numeric_dtype = False + + @deprecate_kwarg(old_arg_name='infer_dst', new_arg_name='ambiguous', + mapping={True: 'infer', False: 'raise'}) def __new__(cls, data=None, freq=None, start=None, end=None, periods=None, copy=False, name=None, tz=None, verify_integrity=True, normalize=False, - closed=None, **kwargs): + closed=None, ambiguous='raise', **kwargs): dayfirst = kwargs.pop('dayfirst', None) yearfirst = kwargs.pop('yearfirst', None) - infer_dst = kwargs.pop('infer_dst', False) freq_infer = False if not isinstance(freq, DateOffset): @@ -214,7 +223,7 @@ def __new__(cls, data=None, if data is None: return cls._generate(start, end, periods, name, freq, tz=tz, normalize=normalize, closed=closed, - infer_dst=infer_dst) + ambiguous=ambiguous) if not isinstance(data, (np.ndarray, Index, ABCSeries)): if np.isscalar(data): @@ -240,7 +249,7 @@ def __new__(cls, data=None, data.name = name if tz is not None: - return data.tz_localize(tz, infer_dst=infer_dst) + return data.tz_localize(tz, ambiguous=ambiguous) return data @@ -288,6 +297,8 @@ def __new__(cls, data=None, # make sure that we have a index/ndarray like (and not a Series) if isinstance(subarr, ABCSeries): subarr = subarr.values + if subarr.dtype == np.object_: + subarr = tools.to_datetime(subarr, box=False) except ValueError: # tz aware @@ -309,7 +320,7 @@ def __new__(cls, data=None, # Convert tz-naive to UTC ints = subarr.view('i8') subarr = tslib.tz_localize_to_utc(ints, tz, - infer_dst=infer_dst) + ambiguous=ambiguous) subarr = subarr.view(_NS_DTYPE) @@ -333,7 +344,7 @@ def __new__(cls, data=None, @classmethod def _generate(cls, start, end, periods, name, offset, - tz=None, normalize=False, infer_dst=False, closed=None): + tz=None, normalize=False, ambiguous='raise', closed=None): if com._count_not_none(start, end, periods) != 2: raise ValueError('Must specify two of start, end, or periods') @@ -447,7 +458,7 @@ def _generate(cls, start, end, periods, name, offset, if tz is not None and getattr(index, 'tz', None) is None: index = tslib.tz_localize_to_utc(com._ensure_int64(index), tz, - infer_dst=infer_dst) + ambiguous=ambiguous) index = index.view(_NS_DTYPE) index = cls._simple_new(index, name=name, freq=offset, tz=tz) @@ -479,7 +490,7 @@ def _local_timestamps(self): return result.take(reverse) @classmethod - def _simple_new(cls, values, name=None, freq=None, tz=None): + def _simple_new(cls, values, name=None, freq=None, tz=None, **kwargs): if not getattr(values,'dtype',None): values = 
np.array(values,copy=False) if values.dtype != _NS_DTYPE: @@ -583,6 +594,15 @@ def _formatter_func(self): formatter = _get_format_datetime64(is_dates_only=self._is_dates_only) return lambda x: formatter(x, tz=self.tz) + def __reduce__(self): + + # we use a special reudce here because we need + # to simply set the .tz (and not reinterpret it) + + d = dict(data=self._data) + d.update(self._get_attributes_dict()) + return _new_DatetimeIndex, (self.__class__, d), None + def __setstate__(self, state): """Necessary for making this object picklable""" if isinstance(state, dict): @@ -610,20 +630,30 @@ def __setstate__(self, state): np.ndarray.__setstate__(data, state) self._data = data + self._reset_identity() else: raise Exception("invalid pickle state") _unpickle_compat = __setstate__ + def _sub_datelike(self, other): + # subtract a datetime from myself, yielding a TimedeltaIndex + + from pandas import TimedeltaIndex + other = Timestamp(other) + i8 = self.asi8 + result = i8 - other.value + if self.hasnans: + mask = i8 == tslib.iNaT + result[mask] = tslib.iNaT + return TimedeltaIndex(result,name=self.name,copy=False) + def _add_delta(self, delta): - if isinstance(delta, (Tick, timedelta)): - inc = offsets._delta_to_nanoseconds(delta) - mask = self.asi8 == tslib.iNaT - new_values = (self.asi8 + inc).view(_NS_DTYPE) - new_values[mask] = tslib.iNaT - new_values = new_values.view(_NS_DTYPE) - elif isinstance(delta, np.timedelta64): - new_values = self.to_series() + delta + from pandas import TimedeltaIndex + if isinstance(delta, (Tick, timedelta, np.timedelta64)): + new_values = self._add_delta_td(delta) + elif isinstance(delta, TimedeltaIndex): + new_values = self._add_delta_tdi(delta) else: new_values = self.astype('O') + delta tz = 'UTC' if self.tz is not None else None @@ -633,16 +663,6 @@ def _add_delta(self, delta): result = result.tz_convert(self.tz) return result - def __contains__(self, key): - try: - res = self.get_loc(key) - return np.isscalar(res) or type(res) == slice - except (KeyError, TypeError): - return False - - def _format_with_header(self, header, **kwargs): - return header + self._format_native_types(**kwargs) - def _format_native_types(self, na_rep=u('NaT'), date_format=None, **kwargs): data = self.asobject @@ -652,35 +672,9 @@ def _format_native_types(self, na_rep=u('NaT'), date_format=date_format, justify='all').get_result() - def isin(self, values): - """ - Compute boolean array of whether each index value is found in the - passed set of values - - Parameters - ---------- - values : set or sequence of values - - Returns - ------- - is_contained : ndarray (boolean dtype) - """ - if not isinstance(values, DatetimeIndex): - try: - values = DatetimeIndex(values) - except ValueError: - return self.asobject.isin(values) - - value_set = set(values.asi8) - return lib.ismember(self.asi8, value_set) - def to_datetime(self, dayfirst=False): return self.copy() - def groupby(self, f): - objs = self.asobject.values - return _algos.groupby_object(objs, f) - def summary(self, name=None): if len(self) > 0: index_summary = ', %s to %s' % (com.pprint_thing(self[0]), @@ -697,9 +691,9 @@ def summary(self, name=None): return result - def get_duplicates(self): - values = Index.get_duplicates(self) - return DatetimeIndex(values) + def _format_footer(self): + tagline = 'Length: %d, Freq: %s, Timezone: %s' + return tagline % (len(self), self.freqstr, self.tz) def astype(self, dtype): dtype = np.dtype(dtype) @@ -742,11 +736,16 @@ def to_series(self, keep_tz=False): ------- Series """ - return 
super(DatetimeIndex, self).to_series(keep_tz=keep_tz) + from pandas import Series + return Series(self._to_embed(keep_tz), index=self, name=self.name) def _to_embed(self, keep_tz=False): - """ return an array repr of this object, potentially casting to object """ - if keep_tz and self.tz is not None and str(self.tz) != 'UTC': + """ + return an array repr of this object, potentially casting to object + + This is for internal compat + """ + if keep_tz and self.tz is not None: return self.asobject.values return self.values @@ -777,23 +776,6 @@ def to_period(self, freq=None): return PeriodIndex(self.values, name=self.name, freq=freq, tz=self.tz) - def order(self, return_indexer=False, ascending=True): - """ - Return sorted copy of Index - """ - if return_indexer: - _as = self.argsort() - if not ascending: - _as = _as[::-1] - sorted_index = self.take(_as) - return sorted_index, _as - else: - sorted_values = np.sort(self.values) - if not ascending: - sorted_values = sorted_values[::-1] - return self._simple_new(sorted_values, self.name, None, - self.tz) - def snap(self, freq='S'): """ Snap time stamps to nearest occurring frequency @@ -818,56 +800,6 @@ def snap(self, freq='S'): # we know it conforms; skip check return DatetimeIndex(snapped, freq=freq, verify_integrity=False) - def shift(self, n, freq=None): - """ - Specialized shift which produces a DatetimeIndex - - Parameters - ---------- - n : int - Periods to shift by - freq : DateOffset or timedelta-like, optional - - Returns - ------- - shifted : DatetimeIndex - """ - if freq is not None and freq != self.offset: - if isinstance(freq, compat.string_types): - freq = to_offset(freq) - result = Index.shift(self, n, freq) - result.tz = self.tz - - return result - - if n == 0: - # immutable so OK - return self - - if self.offset is None: - raise ValueError("Cannot shift with no offset") - - start = self[0] + n * self.offset - end = self[-1] + n * self.offset - return DatetimeIndex(start=start, end=end, freq=self.offset, - name=self.name, tz=self.tz) - - def repeat(self, repeats, axis=None): - """ - Analogous to ndarray.repeat - """ - return DatetimeIndex(self.values.repeat(repeats), - name=self.name) - - def take(self, indices, axis=0): - """ - Analogous to ndarray.take - """ - maybe_slice = lib.maybe_indices_to_slice(com._ensure_int64(indices)) - if isinstance(maybe_slice, slice): - return self[maybe_slice] - return super(DatetimeIndex, self).take(indices, axis) - def union(self, other): """ Specialized union for DatetimeIndex objects. If combine @@ -1326,52 +1258,6 @@ def slice_indexer(self, start=None, end=None, step=None): return Index.slice_indexer(self, start, end, step) - def slice_locs(self, start=None, end=None): - """ - Index.slice_locs, customized to handle partial ISO-8601 string slicing - """ - if isinstance(start, compat.string_types) or isinstance(end, compat.string_types): - - if self.is_monotonic: - try: - if start: - start_loc = self._get_string_slice(start).start - else: - start_loc = 0 - - if end: - end_loc = self._get_string_slice(end).stop - else: - end_loc = len(self) - - return start_loc, end_loc - except KeyError: - pass - - else: - # can't use a slice indexer because we are not sorted! 
- # so create an indexer directly - try: - if start: - start_loc = self._get_string_slice(start, - use_rhs=False) - else: - start_loc = np.arange(len(self)) - - if end: - end_loc = self._get_string_slice(end, use_lhs=False) - else: - end_loc = np.arange(len(self)) - - return start_loc, end_loc - except KeyError: - pass - - if isinstance(start, time) or isinstance(end, time): - raise KeyError('Cannot use slice_locs with time slice keys') - - return Index.slice_locs(self, start, end) - def __getitem__(self, key): getitem = self._data.__getitem__ if np.isscalar(key): @@ -1398,17 +1284,6 @@ def __getitem__(self, key): return self._simple_new(result, self.name, new_offset, self.tz) - # Try to run function on index first, and then on elements of index - # Especially important for group-by functionality - def map(self, f): - try: - result = f(self) - if not isinstance(result, (np.ndarray, Index)): - raise TypeError - return result - except Exception: - return _algos.arrmap_object(self.asobject.values, f) - # alias to offset def _get_freq(self): return self.offset @@ -1417,13 +1292,6 @@ def _set_freq(self, value): self.offset = value freq = property(fget=_get_freq, fset=_set_freq, doc="get/set the frequncy of the Index") - @cache_readonly - def inferred_freq(self): - try: - return infer_freq(self) - except ValueError: - return None - @property def freqstr(self): """ return the frequency object as a string if its set, otherwise None """ @@ -1645,7 +1513,9 @@ def tz_convert(self, tz): # No conversion since timestamps are all UTC to begin with return self._shallow_copy(tz=tz) - def tz_localize(self, tz, infer_dst=False): + @deprecate_kwarg(old_arg_name='infer_dst', new_arg_name='ambiguous', + mapping={True: 'infer', False: 'raise'}) + def tz_localize(self, tz, ambiguous='raise'): """ Localize tz-naive DatetimeIndex to given time zone (using pytz/dateutil), or remove timezone from tz-aware DatetimeIndex @@ -1656,7 +1526,13 @@ def tz_localize(self, tz, infer_dst=False): Time zone for time. Corresponding timestamps would be converted to time zone of the TimeSeries. None will remove timezone holding local time. 
- infer_dst : boolean, default False + ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' + - 'infer' will attempt to infer fall dst-transition hours based on order + - bool-ndarray where True signifies a DST time, False signifies + a non-DST time (note that this flag is only applicable for ambiguous times) + - 'NaT' will return NaT where there are ambiguous times + - 'raise' will raise an AmbiguousTimeError if there are ambiguous times + infer_dst : boolean, default False (DEPRECATED) Attempt to infer fall dst-transition hours based on order Returns @@ -1671,7 +1547,9 @@ def tz_localize(self, tz, infer_dst=False): else: tz = tslib.maybe_get_tz(tz) # Convert to UTC - new_dates = tslib.tz_localize_to_utc(self.asi8, tz, infer_dst=infer_dst) + + new_dates = tslib.tz_localize_to_utc(self.asi8, tz, + ambiguous=ambiguous) new_dates = new_dates.view(_NS_DTYPE) return self._shallow_copy(new_dates, tz=tz) diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index cd37f4000e5a2..55aad38c10fae 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -152,6 +152,12 @@ def __add__(date): """ _cacheable = False _normalize_cache = True + _kwds_use_relativedelta = ( + 'years', 'months', 'weeks', 'days', + 'year', 'month', 'week', 'day', 'weekday', + 'hour', 'minute', 'second', 'microsecond' + ) + _use_relativedelta = False # default for prior pickles normalize = False @@ -160,21 +166,52 @@ def __init__(self, n=1, normalize=False, **kwds): self.n = int(n) self.normalize = normalize self.kwds = kwds - if len(kwds) > 0: - self._offset = relativedelta(**kwds) + self._offset, self._use_relativedelta = self._determine_offset() + + def _determine_offset(self): + # timedelta is used for sub-daily plural offsets and all singular offsets + # relativedelta is used for plural offsets of daily length or more + # nanosecond(s) are handled by apply_wraps + kwds_no_nanos = dict( + (k, v) for k, v in self.kwds.items() + if k not in ('nanosecond', 'nanoseconds') + ) + use_relativedelta = False + + if len(kwds_no_nanos) > 0: + if any(k in self._kwds_use_relativedelta for k in kwds_no_nanos): + use_relativedelta = True + offset = relativedelta(**kwds_no_nanos) + else: + # sub-daily offset - use timedelta (tz-aware) + offset = timedelta(**kwds_no_nanos) else: - self._offset = timedelta(1) + offset = timedelta(1) + return offset, use_relativedelta @apply_wraps def apply(self, other): + if self._use_relativedelta: + other = as_datetime(other) + if len(self.kwds) > 0: + tzinfo = getattr(other, 'tzinfo', None) + if tzinfo is not None and self._use_relativedelta: + # perform calculation in UTC + other = other.replace(tzinfo=None) + if self.n > 0: for i in range(self.n): other = other + self._offset else: for i in range(-self.n): other = other - self._offset - return other + + if tzinfo is not None and self._use_relativedelta: + # bring tz back from UTC calculation + other = tslib._localize_pydatetime(other, tzinfo) + + return as_timestamp(other) else: return other + timedelta(self.n) @@ -188,12 +225,12 @@ def _should_cache(self): return self.isAnchored() and self._cacheable def _params(self): - attrs = [(k, v) for k, v in compat.iteritems(vars(self)) - if (k not in ['kwds', 'name', 'normalize', - 'busdaycalendar']) and (k[0] != '_')] - attrs.extend(list(self.kwds.items())) + all_paras = dict(list(vars(self).items()) + list(self.kwds.items())) + if 'holidays' in all_paras and not all_paras['holidays']: + all_paras.pop('holidays') + exclude = ['kwds', 'name','normalize', 'calendar'] + attrs = 
[(k, v) for k, v in all_paras.items() if (k not in exclude ) and (k[0] != '_')] attrs = sorted(set(attrs)) - params = tuple([str(self.__class__)] + attrs) return params @@ -510,38 +547,57 @@ class CustomBusinessDay(BusinessDay): holidays : list list/array of dates to exclude from the set of valid business days, passed to ``numpy.busdaycalendar`` - calendar : HolidayCalendar instance - instance of AbstractHolidayCalendar that provide the list of holidays + calendar : pd.HolidayCalendar or np.busdaycalendar """ - _cacheable = False _prefix = 'C' - def __init__(self, n=1, normalize=False, **kwds): + def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', + holidays=None, calendar=None, **kwds): self.n = int(n) self.normalize = normalize self.kwds = kwds self.offset = kwds.get('offset', timedelta(0)) - self.weekmask = kwds.get('weekmask', 'Mon Tue Wed Thu Fri') - - if 'calendar' in kwds: - holidays = kwds['calendar'].holidays() - else: - holidays = kwds.get('holidays', []) + calendar, holidays = self.get_calendar(weekmask=weekmask, + holidays=holidays, + calendar=calendar) + # CustomBusinessDay instances are identified by the + # following two attributes. See DateOffset._params() + # holidays, weekmask + + self.kwds['weekmask'] = self.weekmask = weekmask + self.kwds['holidays'] = self.holidays = holidays + self.kwds['calendar'] = self.calendar = calendar + + def get_calendar(self, weekmask, holidays, calendar): + '''Generate busdaycalendar''' + if isinstance(calendar, np.busdaycalendar): + if not holidays: + holidays = tuple(calendar.holidays) + elif not isinstance(holidays, tuple): + holidays = tuple(holidays) + else: + # trust that calendar.holidays and holidays are + # consistent + pass + return calendar, holidays + + if holidays is None: + holidays = [] + try: + holidays = holidays + calendar.holidays().tolist() + except AttributeError: + pass holidays = [self._to_dt64(dt, dtype='datetime64[D]') for dt in holidays] - self.holidays = tuple(sorted(holidays)) - self.kwds['holidays'] = self.holidays + holidays = tuple(sorted(holidays)) - self._set_busdaycalendar() + kwargs = {'weekmask': weekmask} + if holidays: + kwargs['holidays'] = holidays - def _set_busdaycalendar(self): - if self.holidays: - kwargs = {'weekmask':self.weekmask,'holidays':self.holidays} - else: - kwargs = {'weekmask':self.weekmask} try: - self.busdaycalendar = np.busdaycalendar(**kwargs) + busdaycalendar = np.busdaycalendar(**kwargs) except: # Check we have the required numpy version from distutils.version import LooseVersion @@ -552,17 +608,23 @@ def _set_busdaycalendar(self): np.__version__) else: raise + return busdaycalendar, holidays def __getstate__(self): """Return a pickleable state""" state = self.__dict__.copy() - del state['busdaycalendar'] + del state['calendar'] return state def __setstate__(self, state): """Reconstruct an instance from a pickled state""" self.__dict__ = state - self._set_busdaycalendar() + calendar, holidays = self.get_calendar(weekmask=self.weekmask, + holidays=self.holidays, + calendar=None) + self.kwds['calendar'] = self.calendar = calendar + self.kwds['holidays'] = self.holidays = holidays + self.kwds['weekmask'] = state['weekmask'] @apply_wraps def apply(self, other): @@ -576,7 +638,7 @@ def apply(self, other): np_dt = np.datetime64(date_in.date()) np_incr_dt = np.busday_offset(np_dt, self.n, roll=roll, - busdaycal=self.busdaycalendar) + busdaycal=self.calendar) dt_date = np_incr_dt.astype(datetime) result = datetime.combine(dt_date, date_in.time()) @@ -598,7 +660,6 @@ 
def _to_dt64(dt, dtype='datetime64'): # > np.datetime64(dt.datetime(2013,5,1),dtype='datetime64[D]') # numpy.datetime64('2013-05-01T02:00:00.000000+0200') # Thus astype is needed to cast datetime to datetime64[D] - if getattr(dt, 'tzinfo', None) is not None: i8 = tslib.pydt_to_i8(dt) dt = tslib.tz_convert_single(i8, 'UTC', dt.tzinfo) @@ -612,7 +673,7 @@ def onOffset(self, dt): if self.normalize and not _is_normalized(dt): return False day64 = self._to_dt64(dt,'datetime64[D]') - return np.is_busday(day64, busdaycal=self.busdaycalendar) + return np.is_busday(day64, busdaycal=self.calendar) class MonthOffset(SingleConstructorOffset): @@ -730,7 +791,6 @@ def onOffset(self, dt): _prefix = 'BMS' - class CustomBusinessMonthEnd(BusinessMixin, MonthOffset): """ **EXPERIMENTAL** DateOffset of one custom business month @@ -751,18 +811,22 @@ class CustomBusinessMonthEnd(BusinessMixin, MonthOffset): holidays : list list/array of dates to exclude from the set of valid business days, passed to ``numpy.busdaycalendar`` + calendar : pd.HolidayCalendar or np.busdaycalendar """ _cacheable = False _prefix = 'CBM' - def __init__(self, n=1, normalize=False, **kwds): + def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', + holidays=None, calendar=None, **kwds): self.n = int(n) self.normalize = normalize self.kwds = kwds self.offset = kwds.get('offset', timedelta(0)) - self.weekmask = kwds.get('weekmask', 'Mon Tue Wed Thu Fri') - self.cbday = CustomBusinessDay(n=self.n, **kwds) - self.m_offset = MonthEnd() + self.cbday = CustomBusinessDay(n=self.n, normalize=normalize, + weekmask=weekmask, holidays=holidays, + calendar=calendar, **kwds) + self.m_offset = MonthEnd(n=1, normalize=normalize, **kwds) + self.kwds['calendar'] = self.cbday.calendar # cache numpy calendar @apply_wraps def apply(self,other): @@ -780,11 +844,11 @@ def apply(self,other): n -= 1 elif other > cur_cmend and n <= -1: n += 1 - - new = cur_mend + n * MonthEnd() + + new = cur_mend + n * self.m_offset result = self.cbday.rollback(new) return result - + class CustomBusinessMonthBegin(BusinessMixin, MonthOffset): """ **EXPERIMENTAL** DateOffset of one custom business month @@ -805,18 +869,22 @@ class CustomBusinessMonthBegin(BusinessMixin, MonthOffset): holidays : list list/array of dates to exclude from the set of valid business days, passed to ``numpy.busdaycalendar`` + calendar : pd.HolidayCalendar or np.busdaycalendar """ _cacheable = False _prefix = 'CBMS' - def __init__(self, n=1, normalize=False, **kwds): + def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', + holidays=None, calendar=None, **kwds): self.n = int(n) self.normalize = normalize self.kwds = kwds self.offset = kwds.get('offset', timedelta(0)) - self.weekmask = kwds.get('weekmask', 'Mon Tue Wed Thu Fri') - self.cbday = CustomBusinessDay(n=self.n, normalize=normalize, **kwds) - self.m_offset = MonthBegin(normalize=normalize) + self.cbday = CustomBusinessDay(n=self.n, normalize=normalize, + weekmask=weekmask, holidays=holidays, + calendar=calendar, **kwds) + self.m_offset = MonthBegin(n=1, normalize=normalize, **kwds) + self.kwds['calendar'] = self.cbday.calendar # cache numpy calendar @apply_wraps def apply(self,other): @@ -835,8 +903,8 @@ def apply(self,other): n += 1 elif dt_in < cur_cmbegin and n >= 1: n -= 1 - - new = cur_mbegin + n * MonthBegin() + + new = cur_mbegin + n * self.m_offset result = self.cbday.rollforward(new) return result diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index dfea3e0486d32..b4d8a6547950d 
100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -8,7 +8,7 @@ import pandas.tseries.frequencies as frequencies from pandas.tseries.frequencies import get_freq_code as _gfc from pandas.tseries.index import DatetimeIndex, Int64Index, Index -from pandas.core.base import DatetimeIndexOpsMixin +from pandas.tseries.base import DatetimeIndexOpsMixin from pandas.tseries.tools import parse_time_string import pandas.tseries.offsets as offsets @@ -16,7 +16,7 @@ from pandas.core.common import (isnull, _INT64_DTYPE, _maybe_box, _values_from_object, ABCSeries) from pandas import compat -from pandas.lib import Timestamp +from pandas.lib import Timestamp, Timedelta import pandas.lib as lib import pandas.tslib as tslib import pandas.algos as _algos @@ -61,7 +61,6 @@ class Period(PandasObject): minute : int, default 0 second : int, default 0 """ - _typ = 'periodindex' __slots__ = ['freq', 'ordinal'] _comparables = ['name','freqstr'] @@ -171,7 +170,7 @@ def __hash__(self): return hash((self.ordinal, self.freq)) def _add_delta(self, other): - if isinstance(other, (timedelta, np.timedelta64, offsets.Tick)): + if isinstance(other, (timedelta, np.timedelta64, offsets.Tick, Timedelta)): offset = frequencies.to_offset(self.freq) if isinstance(offset, offsets.Tick): nanos = tslib._delta_to_nanoseconds(other) @@ -198,7 +197,7 @@ def _add_delta(self, other): def __add__(self, other): if isinstance(other, (timedelta, np.timedelta64, - offsets.Tick, offsets.DateOffset)): + offsets.Tick, offsets.DateOffset, Timedelta)): return self._add_delta(other) elif com.is_integer(other): if self.ordinal == tslib.iNaT: @@ -211,7 +210,7 @@ def __add__(self, other): def __sub__(self, other): if isinstance(other, (timedelta, np.timedelta64, - offsets.Tick, offsets.DateOffset)): + offsets.Tick, offsets.DateOffset, Timedelta)): neg_other = -other return self + neg_other elif com.is_integer(other): @@ -606,10 +605,12 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index): >>> idx2 = PeriodIndex(start='2000', end='2010', freq='A') """ _box_scalars = True + _typ = 'periodindex' _attributes = ['name','freq'] _datetimelike_ops = ['year','month','day','hour','minute','second', - 'weekofyear','week','dayofweek','weekday','dayofyear','quarter', 'qyear'] + 'weekofyear','week','dayofweek','weekday','dayofyear','quarter', 'qyear', 'freq'] _is_numeric_dtype = False + freq = None __eq__ = _period_index_cmp('__eq__') __ne__ = _period_index_cmp('__ne__', nat_result=True) @@ -746,6 +747,10 @@ def __contains__(self, key): def _box_func(self): return lambda x: Period._from_ordinal(ordinal=x, freq=self.freq) + def _to_embed(self, keep_tz=False): + """ return an array repr of this object, potentially casting to object """ + return self.asobject.values + def asof_locs(self, where, mask): """ where : array of timestamps @@ -835,17 +840,6 @@ def to_datetime(self, dayfirst=False): quarter = _field_accessor('quarter', 2, "The quarter of the date") qyear = _field_accessor('qyear', 1) - # Try to run function on index first, and then on elements of index - # Especially important for group-by functionality - def map(self, f): - try: - result = f(self) - if not isinstance(result, (np.ndarray, Index)): - raise TypeError - return result - except Exception: - return _algos.arrmap_object(self.asobject.values, f) - def _get_object_array(self): freq = self.freq return np.array([ Period._from_ordinal(ordinal=x, freq=freq) for x in self.values], copy=False) @@ -898,7 +892,7 @@ def to_timestamp(self, freq=None, how='start'): return 
DatetimeIndex(new_data, freq='infer', name=self.name) def _add_delta(self, other): - if isinstance(other, (timedelta, np.timedelta64, offsets.Tick)): + if isinstance(other, (timedelta, np.timedelta64, offsets.Tick, Timedelta)): offset = frequencies.to_offset(self.freq) if isinstance(offset, offsets.Tick): nanos = tslib._delta_to_nanoseconds(other) @@ -1125,9 +1119,6 @@ def __getitem__(self, key): return PeriodIndex(result, name=self.name, freq=self.freq) - def _format_with_header(self, header, **kwargs): - return header + self._format_native_types(**kwargs) - def _format_native_types(self, na_rep=u('NaT'), **kwargs): values = np.array(list(self), dtype=object) diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index 01aff164d8384..b362c55b156a4 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -5,7 +5,8 @@ from pandas.core.groupby import BinGrouper, Grouper from pandas.tseries.frequencies import to_offset, is_subperiod, is_superperiod from pandas.tseries.index import DatetimeIndex, date_range -from pandas.tseries.offsets import DateOffset, Tick, _delta_to_nanoseconds +from pandas.tseries.tdi import TimedeltaIndex +from pandas.tseries.offsets import DateOffset, Tick, Day, _delta_to_nanoseconds from pandas.tseries.period import PeriodIndex, period_range import pandas.tseries.tools as tools import pandas.core.common as com @@ -96,10 +97,12 @@ def resample(self, obj): obj = self.obj.to_timestamp(how=self.convention) self._set_grouper(obj) rs = self._resample_timestamps() + elif isinstance(ax, TimedeltaIndex): + rs = self._resample_timestamps(kind='timedelta') elif len(ax) == 0: return self.obj else: # pragma: no cover - raise TypeError('Only valid with DatetimeIndex or PeriodIndex') + raise TypeError('Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex') rs_axis = rs._get_axis(self.axis) rs_axis.name = ax.name @@ -109,13 +112,17 @@ def _get_grouper(self, obj): self._set_grouper(obj) return self._get_binner_for_resample() - def _get_binner_for_resample(self): + def _get_binner_for_resample(self, kind=None): # create the BinGrouper # assume that self.set_grouper(obj) has already been called ax = self.ax - if self.kind is None or self.kind == 'timestamp': + if kind is None: + kind = self.kind + if kind is None or kind == 'timestamp': self.binner, bins, binlabels = self._get_time_bins(ax) + elif kind == 'timedelta': + self.binner, bins, binlabels = self._get_time_delta_bins(ax) else: self.binner, bins, binlabels = self._get_time_period_bins(ax) @@ -217,6 +224,25 @@ def _adjust_bin_edges(self, binner, ax_values): return binner, bin_edges + def _get_time_delta_bins(self, ax): + if not isinstance(ax, TimedeltaIndex): + raise TypeError('axis must be a TimedeltaIndex, but got ' + 'an instance of %r' % type(ax).__name__) + + if not len(ax): + binner = labels = TimedeltaIndex(data=[], freq=self.freq, name=ax.name) + return binner, [], labels + + labels = binner = TimedeltaIndex(start=ax[0], + end=ax[-1], + freq=self.freq, + name=ax.name) + + end_stamps = labels + 1 + bins = ax.searchsorted(end_stamps, side='left') + + return binner, bins, labels + def _get_time_period_bins(self, ax): if not isinstance(ax, DatetimeIndex): raise TypeError('axis must be a DatetimeIndex, but got ' @@ -242,11 +268,11 @@ def _get_time_period_bins(self, ax): def _agg_method(self): return self.how if self.how else _DEFAULT_METHOD - def _resample_timestamps(self): + def _resample_timestamps(self, kind=None): # assumes set_grouper(obj) already called axlabels = self.ax - 
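Note: the pandas/tseries/period.py changes above extend Period and PeriodIndex arithmetic to accept the new Timedelta scalar alongside timedelta / np.timedelta64 / Tick. A small sketch mirroring the tests added later in this patch; the values are illustrative.

import pandas as pd

p = pd.Period('2014-07-01 09:00', freq='H')
p + pd.Timedelta(minutes=120)              # -> Period('2014-07-01 11:00', 'H')

rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', freq='H')
rng + pd.Timedelta(minutes=120)            # shifts every element by two hours

# p + pd.Timedelta(seconds=30)             # would raise ValueError: not a multiple of the hourly freq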
self._get_binner_for_resample() + self._get_binner_for_resample(kind=kind) grouper = self.grouper binner = self.binner obj = self.obj @@ -359,9 +385,11 @@ def _get_range_edges(first, last, offset, closed='left', base=0): offset = to_offset(offset) if isinstance(offset, Tick): + is_day = isinstance(offset, Day) day_nanos = _delta_to_nanoseconds(timedelta(1)) + # #1165 - if (day_nanos % offset.nanos) == 0: + if (is_day and day_nanos % offset.nanos == 0) or not is_day: return _adjust_dates_anchored(first, last, offset, closed=closed, base=base) diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py new file mode 100644 index 0000000000000..4822be828a2c3 --- /dev/null +++ b/pandas/tseries/tdi.py @@ -0,0 +1,982 @@ +""" implement the TimedeltaIndex """ + +import operator +import datetime +from datetime import timedelta +import numpy as np + +from pandas.core.common import (ABCSeries, _TD_DTYPE, _INT64_DTYPE, + is_timedelta64_dtype, _maybe_box, + _values_from_object, isnull) +from pandas.core.index import Index, Int64Index +import pandas.compat as compat +from pandas.compat import u +from pandas.core.base import PandasObject +from pandas.util.decorators import cache_readonly +from pandas.tseries.frequencies import to_offset +import pandas.core.common as com +from pandas.tseries import timedeltas +from pandas.tseries.base import DatetimeIndexOpsMixin +from pandas.tseries.timedeltas import to_timedelta, _coerce_scalar_to_timedelta_type +import pandas.tseries.offsets as offsets +from pandas.tseries.offsets import Tick, DateOffset + +import pandas.lib as lib +import pandas.tslib as tslib +import pandas.algos as _algos +import pandas.index as _index + +Timedelta = tslib.Timedelta + +_resolution_map = { + 'ns' : offsets.Nano, + 'us' : offsets.Micro, + 'ms' : offsets.Milli, + 's' : offsets.Second, + 'm' : offsets.Minute, + 'h' : offsets.Hour, + 'D' : offsets.Day, + } + +def _td_index_cmp(opname, nat_result=False): + """ + Wrap comparison operations to convert timedelta-like to timedelta64 + """ + def wrapper(self, other): + func = getattr(super(TimedeltaIndex, self), opname) + if _is_convertible_to_td(other): + other = _to_m8(other) + result = func(other) + if com.isnull(other): + result.fill(nat_result) + else: + if not com.is_list_like(other): + raise TypeError("cannot compare a TimedeltaIndex with type {0}".format(type(other))) + + other = TimedeltaIndex(other).values + result = func(other) + result = _values_from_object(result) + + if isinstance(other, Index): + o_mask = other.values.view('i8') == tslib.iNaT + else: + o_mask = other.view('i8') == tslib.iNaT + + if o_mask.any(): + result[o_mask] = nat_result + + mask = self.asi8 == tslib.iNaT + if mask.any(): + result[mask] = nat_result + + # support of bool dtype indexers + if com.is_bool_dtype(result): + return result + return Index(result) + + return wrapper + +class TimedeltaIndex(DatetimeIndexOpsMixin, Int64Index): + """ + Immutable ndarray of timedelta64 data, represented internally as int64, and + which can be boxed to timedelta objects + + Parameters + ---------- + data : array-like (1-dimensional), optional + Optional timedelta-like data to construct index with + unit: unit of the arg (D,h,m,s,ms,us,ns) denote the unit, optional + which is an integer/float number + freq: a frequency for the index, optional + copy : bool + Make a copy of input ndarray + start : starting value, timedelta-like, optional + If data is None, start is used as the start point in generating regular + timedelta data. 
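Note: with the _get_time_delta_bins path added above, a Series indexed by the new TimedeltaIndex can be resampled like a DatetimeIndex-backed one. A minimal sketch using illustrative data and the groupby-style `how` argument of this era's resample API.

import numpy as np
import pandas as pd

s = pd.Series(np.arange(6), index=pd.timedelta_range('0 days', periods=6, freq='30T'))
s.resample('1H', how='sum')                # aggregates pairs of 30-minute observations into hourly bins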
+ periods : int, optional, > 0 + Number of periods to generate, if generating index. Takes precedence + over end argument + end : end time, timedelta-like, optional + If periods is none, generated index will extend to first conforming + time on or just past end argument + closed : string or None, default None + Make the interval closed with respect to the given frequency to + the 'left', 'right', or both sides (None) + name : object + Name to be stored in the index + """ + + _typ = 'timedeltaindex' + _join_precedence = 10 + def _join_i8_wrapper(joinf, **kwargs): + return DatetimeIndexOpsMixin._join_i8_wrapper(joinf, dtype='m8[ns]', **kwargs) + + _inner_indexer = _join_i8_wrapper(_algos.inner_join_indexer_int64) + _outer_indexer = _join_i8_wrapper(_algos.outer_join_indexer_int64) + _left_indexer = _join_i8_wrapper(_algos.left_join_indexer_int64) + _left_indexer_unique = _join_i8_wrapper( + _algos.left_join_indexer_unique_int64, with_indexers=False) + _arrmap = None + _datetimelike_ops = ['days','hours','minutes','seconds','milliseconds','microseconds', + 'nanoseconds','freq','components'] + + __eq__ = _td_index_cmp('__eq__') + __ne__ = _td_index_cmp('__ne__', nat_result=True) + __lt__ = _td_index_cmp('__lt__') + __gt__ = _td_index_cmp('__gt__') + __le__ = _td_index_cmp('__le__') + __ge__ = _td_index_cmp('__ge__') + + _engine_type = _index.TimedeltaEngine + + _comparables = ['name','freq'] + _attributes = ['name','freq'] + _is_numeric_dtype = True + freq = None + + def __new__(cls, data=None, unit=None, + freq=None, start=None, end=None, periods=None, + copy=False, name=None, + closed=None, verify_integrity=True, **kwargs): + + if isinstance(data, TimedeltaIndex) and freq is None: + if copy: + data = data.copy() + return data + + freq_infer = False + if not isinstance(freq, DateOffset): + + # if a passed freq is None, don't infer automatically + if freq != 'infer': + freq = to_offset(freq) + else: + freq_infer = True + freq = None + + if periods is not None: + if com.is_float(periods): + periods = int(periods) + elif not com.is_integer(periods): + raise ValueError('Periods must be a number, got %s' % + str(periods)) + + if data is None and freq is None: + raise ValueError("Must provide freq argument if no data is " + "supplied") + + if data is None: + return cls._generate(start, end, periods, name, freq, + closed=closed) + + if unit is not None: + data = to_timedelta(data, unit=unit, box=False) + + if not isinstance(data, (np.ndarray, Index, ABCSeries)): + if np.isscalar(data): + raise ValueError('TimedeltaIndex() must be called with a ' + 'collection of some kind, %s was passed' + % repr(data)) + + # convert if not already + if getattr(data,'dtype',None) != _TD_DTYPE: + data = to_timedelta(data,unit=unit,box=False) + elif copy: + data = np.array(data,copy=True) + + # check that we are matching freqs + if verify_integrity and len(data) > 0: + if freq is not None and not freq_infer: + index = cls._simple_new(data, name=name) + inferred = index.inferred_freq + if inferred != freq.freqstr: + on_freq = cls._generate(index[0], None, len(index), name, freq) + if not np.array_equal(index.asi8, on_freq.asi8): + raise ValueError('Inferred frequency {0} from passed timedeltas does not ' + 'conform to passed frequency {1}'.format(inferred, freq.freqstr)) + index.freq = freq + return index + + if freq_infer: + index = cls._simple_new(data, name=name) + inferred = index.inferred_freq + if inferred: + index.freq = to_offset(inferred) + return index + + return cls._simple_new(data, name=name, freq=freq) + 
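Note: a brief sketch of the constructor paths handled by __new__ above (timedelta-like data, numeric data plus a unit, or a generated regular range). Values are illustrative.

import numpy as np
import pandas as pd

pd.TimedeltaIndex(['1 days', '1 days 00:00:05', np.timedelta64(2, 'D')])
pd.TimedeltaIndex(np.arange(5), unit='s')               # numeric data interpreted with a unit
pd.TimedeltaIndex(start='1 days', periods=5, freq='D')  # regular range; freq is required when no data is given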
+ @classmethod + def _generate(cls, start, end, periods, name, offset, closed=None): + if com._count_not_none(start, end, periods) != 2: + raise ValueError('Must specify two of start, end, or periods') + + if start is not None: + start = Timedelta(start) + + if end is not None: + end = Timedelta(end) + + left_closed = False + right_closed = False + + if start is None and end is None: + if closed is not None: + raise ValueError("Closed has to be None if not both of start" + "and end are defined") + + if closed is None: + left_closed = True + right_closed = True + elif closed == "left": + left_closed = True + elif closed == "right": + right_closed = True + else: + raise ValueError("Closed has to be either 'left', 'right' or None") + + index = _generate_regular_range(start, end, periods, offset) + index = cls._simple_new(index, name=name, freq=offset) + + if not left_closed: + index = index[1:] + if not right_closed: + index = index[:-1] + + return index + + @property + def _box_func(self): + return lambda x: Timedelta(x,unit='ns') + + @classmethod + def _simple_new(cls, values, name=None, freq=None, **kwargs): + if not getattr(values,'dtype',None): + values = np.array(values,copy=False) + if values.dtype == np.object_: + values = tslib.array_to_timedelta64(values) + if values.dtype != _TD_DTYPE: + values = com._ensure_int64(values).view(_TD_DTYPE) + + result = object.__new__(cls) + result._data = values + result.name = name + result.freq = freq + result._reset_identity() + return result + + _na_value = tslib.NaT + """The expected NA value to use with this index.""" + + @property + def _formatter_func(self): + from pandas.core.format import _get_format_timedelta64 + return _get_format_timedelta64(self, box=True) + + def _format_footer(self): + tagline = 'Length: %d, Freq: %s' + return tagline % (len(self), self.freqstr) + + def __setstate__(self, state): + """Necessary for making this object picklable""" + if isinstance(state, dict): + super(TimedeltaIndex, self).__setstate__(state) + else: + raise Exception("invalid pickle state") + _unpickle_compat = __setstate__ + + def _add_delta(self, delta): + if isinstance(delta, (Tick, timedelta, np.timedelta64)): + new_values = self._add_delta_td(delta) + elif isinstance(delta, TimedeltaIndex): + new_values = self._add_delta_tdi(delta) + else: + raise ValueError("cannot add the type {0} to a TimedeltaIndex".format(type(delta))) + + result = TimedeltaIndex(new_values, freq='infer') + return result + + def _evaluate_with_timedelta_like(self, other, op, opstr): + + # allow division by a timedelta + if opstr in ['__div__','__truediv__']: + if _is_convertible_to_td(other): + other = Timedelta(other) + if isnull(other): + raise NotImplementedError("division by pd.NaT not implemented") + + i8 = self.asi8 + result = i8/float(other.value) + if self.hasnans: + mask = i8 == tslib.iNaT + result = result.astype('float64') + result[mask] = np.nan + return Index(result,name=self.name,copy=False) + + raise TypeError("can only perform ops with timedelta like values") + + def _add_datelike(self, other): + + # adding a timedeltaindex to a datetimelike + from pandas import Timestamp, DatetimeIndex + other = Timestamp(other) + i8 = self.asi8 + result = i8 + other.value + if self.hasnans: + mask = i8 == tslib.iNaT + result[mask] = tslib.iNaT + return DatetimeIndex(result,name=self.name,copy=False) + + def _sub_datelike(self, other): + raise TypeError("cannot subtract a datelike from a TimedeltaIndex") + + def _format_native_types(self, na_rep=u('NaT'), + date_format=None, 
**kwargs): + from pandas.core.format import Timedelta64Formatter + return Timedelta64Formatter(values=self, + nat_rep=na_rep, + justify='all').get_result() + + def _get_field(self, m): + + values = self.asi8 + hasnans = self.hasnans + if hasnans: + result = np.empty(len(self), dtype='float64') + mask = values == tslib.iNaT + imask = ~mask + result.flat[imask] = np.array([ getattr(Timedelta(val),m) for val in values[imask] ]) + result[mask] = np.nan + else: + result = np.array([ getattr(Timedelta(val),m) for val in values ],dtype='int64') + return result + + @property + def days(self): + """ The number of integer days for each element """ + return self._get_field('days') + + @property + def hours(self): + """ The number of integer hours for each element """ + return self._get_field('hours') + + @property + def minutes(self): + """ The number of integer minutes for each element """ + return self._get_field('minutes') + + @property + def seconds(self): + """ The number of integer seconds for each element """ + return self._get_field('seconds') + + @property + def milliseconds(self): + """ The number of integer milliseconds for each element """ + return self._get_field('milliseconds') + + @property + def microseconds(self): + """ The number of integer microseconds for each element """ + return self._get_field('microseconds') + + @property + def nanoseconds(self): + """ The number of integer nanoseconds for each element """ + return self._get_field('nanoseconds') + + @property + def components(self): + """ + Return a dataframe of the components of the Timedeltas + + Returns + ------- + a DataFrame + """ + from pandas import DataFrame + + columns = ['days','hours','minutes','seconds','milliseconds','microseconds','nanoseconds'] + hasnans = self.hasnans + if hasnans: + def f(x): + if isnull(x): + return [np.nan]*len(columns) + return x.components + else: + def f(x): + return x.components + + result = DataFrame([ f(x) for x in self ]) + result.columns = columns + if not hasnans: + result = result.astype('int64') + return result + + def summary(self, name=None): + formatter = self._formatter_func + if len(self) > 0: + index_summary = ', %s to %s' % (formatter(self[0]), + formatter(self[-1])) + else: + index_summary = '' + + if name is None: + name = type(self).__name__ + result = '%s: %s entries%s' % (com.pprint_thing(name), + len(self), index_summary) + if self.freq: + result += '\nFreq: %s' % self.freqstr + + return result + + def to_pytimedelta(self): + """ + Return TimedeltaIndex as object ndarray of datetime.timedelta objects + + Returns + ------- + datetimes : ndarray + """ + return tslib.ints_to_pytimedelta(self.asi8) + + def astype(self, dtype): + dtype = np.dtype(dtype) + + if dtype == np.object_: + return self.asobject + elif dtype == _INT64_DTYPE: + return self.asi8.copy() + elif dtype == _TD_DTYPE: + return self + elif dtype.kind == 'm': + + # return an index (essentially this is division) + result = self.values.astype(dtype) + if self.hasnans: + result = result.astype('float64') + result[self.asi8 == tslib.iNaT] = np.nan + return Index(result,name=self.name) + + return Index(result.astype('i8'),name=self.name) + + else: # pragma: no cover + raise ValueError('Cannot cast TimedeltaIndex to dtype %s' % dtype) + + def union(self, other): + """ + Specialized union for TimedeltaIndex objects. 
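Note: a sketch of the per-element component accessors and the .components frame defined above, on illustrative data; the presence of NaT makes the scalar accessors return float64 with NaN.

import pandas as pd

tdi = pd.TimedeltaIndex(['1 days 00:00:05', '2 days 12:00:00', pd.NaT])
tdi.days           # days component per element (float64 here because of the NaT)
tdi.seconds        # seconds component per element
tdi.components     # DataFrame with days/hours/.../nanoseconds columns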
If combine + overlapping ranges with the same DateOffset, will be much + faster than Index.union + + Parameters + ---------- + other : TimedeltaIndex or array-like + + Returns + ------- + y : Index or TimedeltaIndex + """ + if _is_convertible_to_index(other): + try: + other = TimedeltaIndex(other) + except TypeError: + pass + + this, other = self, other + + if this._can_fast_union(other): + return this._fast_union(other) + else: + result = Index.union(this, other) + if isinstance(result, TimedeltaIndex): + if result.freq is None: + result.freq = to_offset(result.inferred_freq) + return result + + def append(self, other): + """ + Append a collection of Index options together + + Parameters + ---------- + other : Index or list/tuple of indices + + Returns + ------- + appended : Index + """ + name = self.name + to_concat = [self] + + if isinstance(other, (list, tuple)): + to_concat = to_concat + list(other) + else: + to_concat.append(other) + + for obj in to_concat: + if isinstance(obj, Index) and obj.name != name: + name = None + break + + to_concat = self._ensure_compat_concat(to_concat) + return Index(com._concat_compat(to_concat), name=name) + + def join(self, other, how='left', level=None, return_indexers=False): + """ + See Index.join + """ + if _is_convertible_to_index(other): + try: + other = TimedeltaIndex(other) + except (TypeError, ValueError): + pass + + return Index.join(self, other, how=how, level=level, + return_indexers=return_indexers) + + def _wrap_joined_index(self, joined, other): + name = self.name if self.name == other.name else None + if (isinstance(other, TimedeltaIndex) and self.freq == other.freq + and self._can_fast_union(other)): + joined = self._shallow_copy(joined) + joined.name = name + return joined + else: + return self._simple_new(joined, name) + + def _can_fast_union(self, other): + if not isinstance(other, TimedeltaIndex): + return False + + freq = self.freq + + if freq is None or freq != other.freq: + return False + + if not self.is_monotonic or not other.is_monotonic: + return False + + if len(self) == 0 or len(other) == 0: + return True + + # to make our life easier, "sort" the two ranges + if self[0] <= other[0]: + left, right = self, other + else: + left, right = other, self + + right_start = right[0] + left_end = left[-1] + + # Only need to "adjoin", not overlap + return (right_start == left_end + freq) or right_start in left + + def _fast_union(self, other): + if len(other) == 0: + return self.view(type(self)) + + if len(self) == 0: + return other.view(type(self)) + + # to make our life easier, "sort" the two ranges + if self[0] <= other[0]: + left, right = self, other + else: + left, right = other, self + + left_start, left_end = left[0], left[-1] + right_end = right[-1] + + # concatenate + if left_end < right_end: + loc = right.searchsorted(left_end, side='right') + right_chunk = right.values[loc:] + dates = com._concat_compat((left.values, right_chunk)) + return self._shallow_copy(dates) + else: + return left + + def __array_finalize__(self, obj): + if self.ndim == 0: # pragma: no cover + return self.item() + + self.name = getattr(obj, 'name', None) + self.freq = getattr(obj, 'freq', None) + self._reset_identity() + + def _wrap_union_result(self, other, result): + name = self.name if self.name == other.name else None + return self._simple_new(result, name=name, freq=None) + + def intersection(self, other): + """ + Specialized intersection for TimedeltaIndex objects. 
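Note: a sketch of the specialized set operations above; overlapping ranges that share a freq take the fast-union path. Values are illustrative, and intersection is implemented just below.

import pandas as pd

left = pd.timedelta_range('1 days', periods=5, freq='D')
right = pd.timedelta_range('4 days', periods=5, freq='D')
left.union(right)          # fast path: same freq, overlapping -> '1 days' through '8 days'
left.intersection(right)   # -> '4 days', '5 days'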
May be much faster + than Index.intersection + + Parameters + ---------- + other : TimedeltaIndex or array-like + + Returns + ------- + y : Index or TimedeltaIndex + """ + if not isinstance(other, TimedeltaIndex): + try: + other = TimedeltaIndex(other) + except (TypeError, ValueError): + pass + result = Index.intersection(self, other) + return result + + if len(self) == 0: + return self + if len(other) == 0: + return other + # to make our life easier, "sort" the two ranges + if self[0] <= other[0]: + left, right = self, other + else: + left, right = other, self + + end = min(left[-1], right[-1]) + start = right[0] + + if end < start: + return type(self)(data=[]) + else: + lslice = slice(*left.slice_locs(start, end)) + left_chunk = left.values[lslice] + return self._shallow_copy(left_chunk) + + def _possibly_promote(self, other): + if other.inferred_type == 'timedelta': + other = TimedeltaIndex(other) + return self, other + + def get_value(self, series, key): + """ + Fast lookup of value from 1-dimensional ndarray. Only use this if you + know what you're doing + """ + + if _is_convertible_to_td(key): + key = Timedelta(key) + return self.get_value_maybe_box(series, key) + + try: + return _maybe_box(self, Index.get_value(self, series, key), series, key) + except KeyError: + try: + loc = self._get_string_slice(key) + return series[loc] + except (TypeError, ValueError, KeyError): + pass + + try: + return self.get_value_maybe_box(series, key) + except (TypeError, ValueError, KeyError): + raise KeyError(key) + + def get_value_maybe_box(self, series, key): + if not isinstance(key, Timedelta): + key = Timedelta(key) + values = self._engine.get_value(_values_from_object(series), key) + return _maybe_box(self, values, series, key) + + def get_loc(self, key): + """ + Get integer location for requested label + + Returns + ------- + loc : int + """ + if _is_convertible_to_td(key): + key = Timedelta(key) + return self._engine.get_loc(key) + + try: + return Index.get_loc(self, key) + except (KeyError, ValueError): + try: + return self._get_string_slice(key) + except (TypeError, KeyError, ValueError): + pass + + try: + stamp = Timedelta(key) + return self._engine.get_loc(stamp) + except (KeyError, ValueError): + raise KeyError(key) + + def _get_string_slice(self, key, use_lhs=True, use_rhs=True): + freq = getattr(self, 'freqstr', + getattr(self, 'inferred_freq', None)) + + loc = self._partial_td_slice(key, freq, use_lhs=use_lhs, + use_rhs=use_rhs) + return loc + + def _partial_td_slice(self, key, freq, use_lhs=True, use_rhs=True): + + # given a key, try to figure out a location for a partial slice + if not isinstance(key, compat.string_types): + return key + + parsed = _coerce_scalar_to_timedelta_type(key, box=True) + + is_monotonic = self.is_monotonic + + # figure out the resolution of the passed td + # and round to it + reso = parsed.resolution + t1 = parsed.round(reso) + t2 = t1 + _resolution_map[reso]() - Timedelta(1,'ns') + + stamps = self.asi8 + + if is_monotonic: + + # we are out of range + if len(stamps) and ( + (use_lhs and t1.value < stamps[0] and t2.value < stamps[0]) or ( + (use_rhs and t1.value > stamps[-1] and t2.value > stamps[-1]))): + raise KeyError + + # a monotonic (sorted) series can be sliced + left = stamps.searchsorted(t1.value, side='left') if use_lhs else None + right = stamps.searchsorted(t2.value, side='right') if use_rhs else None + + return slice(left, right) + + lhs_mask = (stamps >= t1.value) if use_lhs else True + rhs_mask = (stamps <= t2.value) if use_rhs else True + + # 
try to find a the dates + return (lhs_mask & rhs_mask).nonzero()[0] + + def __getitem__(self, key): + getitem = self._data.__getitem__ + if np.isscalar(key): + val = getitem(key) + return Timedelta(val) + else: + if com._is_bool_indexer(key): + key = np.asarray(key) + if key.all(): + key = slice(0,None,None) + else: + key = lib.maybe_booleans_to_slice(key.view(np.uint8)) + + result = getitem(key) + if result.ndim > 1: + return result + + return self._simple_new(result, self.name) + + @property + def freqstr(self): + """ return the frequency object as a string if its set, otherwise None """ + if self.freq is None: + return None + return self.freq + + def searchsorted(self, key, side='left'): + if isinstance(key, (np.ndarray, Index)): + key = np.array(key, dtype=_TD_DTYPE, copy=False) + else: + key = _to_m8(key) + + return self.values.searchsorted(key, side=side) + + def is_type_compatible(self, typ): + return typ == self.inferred_type or typ == 'timedelta' + + @property + def inferred_type(self): + return 'timedelta64' + + @property + def dtype(self): + return _TD_DTYPE + + @property + def is_all_dates(self): + return True + + def equals(self, other): + """ + Determines if two Index objects contain the same elements. + """ + if self.is_(other): + return True + + if (not hasattr(other, 'inferred_type') or + other.inferred_type != 'timedelta64'): + try: + other = TimedeltaIndex(other) + except: + return False + + return np.array_equal(self.asi8, other.asi8) + + def insert(self, loc, item): + """ + Make new Index inserting new item at location + + Parameters + ---------- + loc : int + item : object + if not either a Python datetime or a numpy integer-like, returned + Index dtype will be object rather than datetime. + + Returns + ------- + new_index : Index + """ + + # try to convert if possible + if _is_convertible_to_td(item): + try: + item = Timedelta(item) + except: + pass + + freq = None + if isinstance(item, Timedelta): + + # check freq can be preserved on edge cases + if self.freq is not None: + if (loc == 0 or loc == -len(self)) and item + self.freq == self[0]: + freq = self.freq + elif (loc == len(self)) and item - self.freq == self[-1]: + freq = self.freq + item = _to_m8(item) + + try: + new_tds = np.concatenate((self[:loc].asi8, [item.view(np.int64)], + self[loc:].asi8)) + return TimedeltaIndex(new_tds, name=self.name, freq=freq) + + except (AttributeError, TypeError): + + # fall back to object index + if isinstance(item,compat.string_types): + return self.asobject.insert(loc, item) + raise TypeError("cannot insert TimedeltaIndex with incompatible label") + + def delete(self, loc): + """ + Make a new DatetimeIndex with passed location(s) deleted. + + Parameters + ---------- + loc: int, slice or array of ints + Indicate which sub-arrays to remove. 
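Note: a sketch of label-based lookup and the freq-preserving insert/delete defined above and just below; string keys are coerced through Timedelta. Values are illustrative.

import numpy as np
import pandas as pd

tdi = pd.timedelta_range('1 days', periods=5, freq='D')
s = pd.Series(np.arange(5), index=tdi)

tdi.get_loc('3 days')                    # -> 2, via Timedelta coercion
s['3 days']                              # scalar lookup through get_value
tdi.insert(0, pd.Timedelta('0 days'))    # freq 'D' is preserved for an edge insertion
tdi.delete(0)                            # freq preserved or re-inferred where possible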
+ + Returns + ------- + new_index : TimedeltaIndex + """ + new_tds = np.delete(self.asi8, loc) + + freq = 'infer' + if lib.is_integer(loc): + if loc in (0, -len(self), -1, len(self) - 1): + freq = self.freq + else: + if com.is_list_like(loc): + loc = lib.maybe_indices_to_slice(com._ensure_int64(np.array(loc))) + if isinstance(loc, slice) and loc.step in (1, None): + if (loc.start in (0, None) or loc.stop in (len(self), None)): + freq = self.freq + + return TimedeltaIndex(new_tds, name=self.name, freq=freq) + +TimedeltaIndex._add_numeric_methods() + +def _is_convertible_to_index(other): + """ return a boolean whether I can attempt conversion to a TimedeltaIndex """ + if isinstance(other, TimedeltaIndex): + return True + elif (len(other) > 0 and + other.inferred_type not in ('floating', 'mixed-integer','integer', + 'mixed-integer-float', 'mixed')): + return True + return False + + +def _is_convertible_to_td(key): + return isinstance(key, (DateOffset, timedelta, Timedelta, np.timedelta64, compat.string_types)) + +def _to_m8(key): + ''' + Timedelta-like => dt64 + ''' + if not isinstance(key, Timedelta): + # this also converts strings + key = Timedelta(key) + + # return an type that can be compared + return np.int64(key.value).view(_TD_DTYPE) + +def _generate_regular_range(start, end, periods, offset): + stride = offset.nanos + if periods is None: + b = Timedelta(start).value + e = Timedelta(end).value + e += stride - e % stride + elif start is not None: + b = Timedelta(start).value + e = b + periods * stride + elif end is not None: + e = Timedelta(end).value + stride + b = e - periods * stride + else: + raise NotImplementedError + + data = np.arange(b, e, stride, dtype=np.int64) + data = TimedeltaIndex._simple_new(data, None) + + return data + + +def timedelta_range(start=None, end=None, periods=None, freq='D', + name=None, closed=None): + """ + Return a fixed frequency timedelta index, with day as the default + frequency + + Parameters + ---------- + start : string or timedelta-like, default None + Left bound for generating dates + end : string or datetime-like, default None + Right bound for generating dates + periods : integer or None, default None + If None, must specify start and end + freq : string or DateOffset, default 'D' (calendar daily) + Frequency strings can have multiples, e.g. 
'5H' + name : str, default None + Name of the resulting index + closed : string or None, default None + Make the interval closed with respect to the given frequency to + the 'left', 'right', or both sides (None) + + Notes + ----- + 2 of start, end, or periods must be specified + + Returns + ------- + rng : TimedeltaIndex + """ + return TimedeltaIndex(start=start, end=end, periods=periods, + freq=freq, name=name, + closed=closed) + + diff --git a/pandas/tseries/tests/data/cday-0.14.1.pickle b/pandas/tseries/tests/data/cday-0.14.1.pickle new file mode 100644 index 0000000000000..48488099482e4 Binary files /dev/null and b/pandas/tseries/tests/data/cday-0.14.1.pickle differ diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py new file mode 100644 index 0000000000000..367ea276646ee --- /dev/null +++ b/pandas/tseries/tests/test_base.py @@ -0,0 +1,966 @@ +from __future__ import print_function +import re +from datetime import datetime, timedelta +import numpy as np +import pandas as pd +from pandas.tseries.base import DatetimeIndexOpsMixin +from pandas.util.testing import assertRaisesRegexp, assert_isinstance +from pandas.tseries.common import is_datetimelike +from pandas import (Series, Index, Int64Index, Timestamp, DatetimeIndex, PeriodIndex, + TimedeltaIndex, Timedelta, timedelta_range, date_range, Float64Index) +import pandas.tslib as tslib +import nose + +import pandas.util.testing as tm + +from pandas.tests.test_base import Ops + +class TestDatetimeIndexOps(Ops): + tz = [None, 'UTC', 'Asia/Tokyo', 'US/Eastern', + 'dateutil/Asia/Singapore', 'dateutil/US/Pacific'] + + def setUp(self): + super(TestDatetimeIndexOps, self).setUp() + mask = lambda x: isinstance(x, DatetimeIndex) or isinstance(x, PeriodIndex) + self.is_valid_objs = [ o for o in self.objs if mask(o) ] + self.not_valid_objs = [ o for o in self.objs if not mask(o) ] + + def test_ops_properties(self): + self.check_ops_properties(['year','month','day','hour','minute','second','weekofyear','week','dayofweek','dayofyear','quarter']) + self.check_ops_properties(['date','time','microsecond','nanosecond', 'is_month_start', 'is_month_end', 'is_quarter_start', + 'is_quarter_end', 'is_year_start', 'is_year_end'], lambda x: isinstance(x,DatetimeIndex)) + + def test_ops_properties_basic(self): + + # sanity check that the behavior didn't change + # GH7206 + for op in ['year','day','second','weekday']: + self.assertRaises(TypeError, lambda x: getattr(self.dt_series,op)) + + # attribute access should still work! 
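Note: the timedelta_range helper added above generates a fixed-frequency TimedeltaIndex; exactly two of start/end/periods must be supplied. A short sketch with illustrative values.

import pandas as pd

pd.timedelta_range(start='1 days', periods=5, freq='D')
pd.timedelta_range(start='0 hours', end='12 hours', freq='3H')
pd.timedelta_range(start='1 days', end='2 days', freq='6H', closed='left')  # drops the right endpoint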
+ s = Series(dict(year=2000,month=1,day=10)) + self.assertEquals(s.year,2000) + self.assertEquals(s.month,1) + self.assertEquals(s.day,10) + self.assertRaises(AttributeError, lambda : s.weekday) + + def test_asobject_tolist(self): + idx = pd.date_range(start='2013-01-01', periods=4, freq='M', name='idx') + expected_list = [pd.Timestamp('2013-01-31'), pd.Timestamp('2013-02-28'), + pd.Timestamp('2013-03-31'), pd.Timestamp('2013-04-30')] + expected = pd.Index(expected_list, dtype=object, name='idx') + result = idx.asobject + self.assertTrue(isinstance(result, Index)) + + self.assertEqual(result.dtype, object) + self.assertTrue(result.equals(expected)) + self.assertEqual(result.name, expected.name) + self.assertEqual(idx.tolist(), expected_list) + + idx = pd.date_range(start='2013-01-01', periods=4, freq='M', name='idx', tz='Asia/Tokyo') + expected_list = [pd.Timestamp('2013-01-31', tz='Asia/Tokyo'), + pd.Timestamp('2013-02-28', tz='Asia/Tokyo'), + pd.Timestamp('2013-03-31', tz='Asia/Tokyo'), + pd.Timestamp('2013-04-30', tz='Asia/Tokyo')] + expected = pd.Index(expected_list, dtype=object, name='idx') + result = idx.asobject + self.assertTrue(isinstance(result, Index)) + self.assertEqual(result.dtype, object) + self.assertTrue(result.equals(expected)) + self.assertEqual(result.name, expected.name) + self.assertEqual(idx.tolist(), expected_list) + + idx = DatetimeIndex([datetime(2013, 1, 1), datetime(2013, 1, 2), + pd.NaT, datetime(2013, 1, 4)], name='idx') + expected_list = [pd.Timestamp('2013-01-01'), pd.Timestamp('2013-01-02'), + pd.NaT, pd.Timestamp('2013-01-04')] + expected = pd.Index(expected_list, dtype=object, name='idx') + result = idx.asobject + self.assertTrue(isinstance(result, Index)) + self.assertEqual(result.dtype, object) + self.assertTrue(result.equals(expected)) + self.assertEqual(result.name, expected.name) + self.assertEqual(idx.tolist(), expected_list) + + def test_minmax(self): + for tz in self.tz: + # monotonic + idx1 = pd.DatetimeIndex([pd.NaT, '2011-01-01', '2011-01-02', + '2011-01-03'], tz=tz) + self.assertTrue(idx1.is_monotonic) + + # non-monotonic + idx2 = pd.DatetimeIndex(['2011-01-01', pd.NaT, '2011-01-03', + '2011-01-02', pd.NaT], tz=tz) + self.assertFalse(idx2.is_monotonic) + + for idx in [idx1, idx2]: + self.assertEqual(idx.min(), pd.Timestamp('2011-01-01', tz=tz)) + self.assertEqual(idx.max(), pd.Timestamp('2011-01-03', tz=tz)) + + for op in ['min', 'max']: + # Return NaT + obj = DatetimeIndex([]) + self.assertTrue(pd.isnull(getattr(obj, op)())) + + obj = DatetimeIndex([pd.NaT]) + self.assertTrue(pd.isnull(getattr(obj, op)())) + + obj = DatetimeIndex([pd.NaT, pd.NaT, pd.NaT]) + self.assertTrue(pd.isnull(getattr(obj, op)())) + + def test_representation(self): + idx1 = DatetimeIndex([], freq='D') + idx2 = DatetimeIndex(['2011-01-01'], freq='D') + idx3 = DatetimeIndex(['2011-01-01', '2011-01-02'], freq='D') + idx4 = DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], freq='D') + idx5 = DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'], + freq='H', tz='Asia/Tokyo') + idx6 = DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', pd.NaT], + tz='US/Eastern') + + exp1 = """ +Length: 0, Freq: D, Timezone: None""" + exp2 = """ +[2011-01-01] +Length: 1, Freq: D, Timezone: None""" + exp3 = """ +[2011-01-01, 2011-01-02] +Length: 2, Freq: D, Timezone: None""" + exp4 = """ +[2011-01-01, ..., 2011-01-03] +Length: 3, Freq: D, Timezone: None""" + exp5 = """ +[2011-01-01 09:00:00+09:00, ..., 2011-01-01 11:00:00+09:00] +Length: 3, Freq: H, Timezone: 
Asia/Tokyo""" + exp6 = """ +[2011-01-01 09:00:00-05:00, ..., NaT] +Length: 3, Freq: None, Timezone: US/Eastern""" + + for idx, expected in zip([idx1, idx2, idx3, idx4, idx5, idx6], + [exp1, exp2, exp3, exp4, exp5, exp6]): + for func in ['__repr__', '__unicode__', '__str__']: + result = getattr(idx, func)() + self.assertEqual(result, expected) + + def test_resolution(self): + for freq, expected in zip(['A', 'Q', 'M', 'D', 'H', 'T', 'S', 'L', 'U'], + ['day', 'day', 'day', 'day', + 'hour', 'minute', 'second', 'millisecond', 'microsecond']): + for tz in [None, 'Asia/Tokyo', 'US/Eastern']: + idx = pd.date_range(start='2013-04-01', periods=30, freq=freq, tz=tz) + self.assertEqual(idx.resolution, expected) + + def test_add_iadd(self): + for tz in self.tz: + # union + rng1 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + other1 = pd.date_range('1/6/2000', freq='D', periods=5, tz=tz) + expected1 = pd.date_range('1/1/2000', freq='D', periods=10, tz=tz) + + rng2 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + other2 = pd.date_range('1/4/2000', freq='D', periods=5, tz=tz) + expected2 = pd.date_range('1/1/2000', freq='D', periods=8, tz=tz) + + rng3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + other3 = pd.DatetimeIndex([], tz=tz) + expected3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + + for rng, other, expected in [(rng1, other1, expected1), (rng2, other2, expected2), + (rng3, other3, expected3)]: + result_add = rng + other + result_union = rng.union(other) + + tm.assert_index_equal(result_add, expected) + tm.assert_index_equal(result_union, expected) + rng += other + tm.assert_index_equal(rng, expected) + + # offset + offsets = [pd.offsets.Hour(2), timedelta(hours=2), + np.timedelta64(2, 'h'), Timedelta(hours=2)] + + for delta in offsets: + rng = pd.date_range('2000-01-01', '2000-02-01', tz=tz) + result = rng + delta + expected = pd.date_range('2000-01-01 02:00', '2000-02-01 02:00', tz=tz) + tm.assert_index_equal(result, expected) + rng += delta + tm.assert_index_equal(rng, expected) + + # int + rng = pd.date_range('2000-01-01 09:00', freq='H', periods=10, tz=tz) + result = rng + 1 + expected = pd.date_range('2000-01-01 10:00', freq='H', periods=10, tz=tz) + tm.assert_index_equal(result, expected) + rng += 1 + tm.assert_index_equal(rng, expected) + + def test_sub_isub(self): + for tz in self.tz: + # diff + rng1 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + other1 = pd.date_range('1/6/2000', freq='D', periods=5, tz=tz) + expected1 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + + rng2 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + other2 = pd.date_range('1/4/2000', freq='D', periods=5, tz=tz) + expected2 = pd.date_range('1/1/2000', freq='D', periods=3, tz=tz) + + rng3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + other3 = pd.DatetimeIndex([], tz=tz) + expected3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + + for rng, other, expected in [(rng1, other1, expected1), (rng2, other2, expected2), + (rng3, other3, expected3)]: + result_union = rng.difference(other) + + tm.assert_index_equal(result_union, expected) + + # offset + offsets = [pd.offsets.Hour(2), timedelta(hours=2), np.timedelta64(2, 'h'), + Timedelta(hours=2)] + + for delta in offsets: + rng = pd.date_range('2000-01-01', '2000-02-01', tz=tz) + result = rng - delta + expected = pd.date_range('1999-12-31 22:00', '2000-01-31 22:00', tz=tz) + tm.assert_index_equal(result, expected) + rng -= delta + tm.assert_index_equal(rng, expected) + + # int + rng 
= pd.date_range('2000-01-01 09:00', freq='H', periods=10, tz=tz) + result = rng - 1 + expected = pd.date_range('2000-01-01 08:00', freq='H', periods=10, tz=tz) + tm.assert_index_equal(result, expected) + rng -= 1 + tm.assert_index_equal(rng, expected) + + def test_value_counts_unique(self): + # GH 7735 + for tz in [None, 'UTC', 'Asia/Tokyo', 'US/Eastern']: + idx = pd.date_range('2011-01-01 09:00', freq='H', periods=10) + # create repeated values, 'n'th element is repeated by n+1 times + idx = DatetimeIndex(np.repeat(idx.values, range(1, len(idx) + 1)), tz=tz) + + exp_idx = pd.date_range('2011-01-01 18:00', freq='-1H', periods=10, tz=tz) + expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64') + tm.assert_series_equal(idx.value_counts(), expected) + + expected = pd.date_range('2011-01-01 09:00', freq='H', periods=10, tz=tz) + tm.assert_index_equal(idx.unique(), expected) + + idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 09:00', '2013-01-01 09:00', + '2013-01-01 08:00', '2013-01-01 08:00', pd.NaT], tz=tz) + + exp_idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 08:00'], tz=tz) + expected = Series([3, 2], index=exp_idx) + tm.assert_series_equal(idx.value_counts(), expected) + + exp_idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 08:00', pd.NaT], tz=tz) + expected = Series([3, 2, 1], index=exp_idx) + tm.assert_series_equal(idx.value_counts(dropna=False), expected) + + tm.assert_index_equal(idx.unique(), exp_idx) + + +class TestTimedeltaIndexOps(Ops): + + def setUp(self): + super(TestTimedeltaIndexOps, self).setUp() + mask = lambda x: isinstance(x, TimedeltaIndex) + self.is_valid_objs = [ o for o in self.objs if mask(o) ] + self.not_valid_objs = [ ] + + def test_ops_properties(self): + self.check_ops_properties(['days','hours','minutes','seconds','milliseconds']) + self.check_ops_properties(['microseconds','nanoseconds']) + + def test_asobject_tolist(self): + idx = timedelta_range(start='1 days', periods=4, freq='D', name='idx') + expected_list = [Timedelta('1 days'),Timedelta('2 days'),Timedelta('3 days'), + Timedelta('4 days')] + expected = pd.Index(expected_list, dtype=object, name='idx') + result = idx.asobject + self.assertTrue(isinstance(result, Index)) + + self.assertEqual(result.dtype, object) + self.assertTrue(result.equals(expected)) + self.assertEqual(result.name, expected.name) + self.assertEqual(idx.tolist(), expected_list) + + idx = TimedeltaIndex([timedelta(days=1),timedelta(days=2),pd.NaT, + timedelta(days=4)], name='idx') + expected_list = [Timedelta('1 days'),Timedelta('2 days'),pd.NaT, + Timedelta('4 days')] + expected = pd.Index(expected_list, dtype=object, name='idx') + result = idx.asobject + self.assertTrue(isinstance(result, Index)) + self.assertEqual(result.dtype, object) + self.assertTrue(result.equals(expected)) + self.assertEqual(result.name, expected.name) + self.assertEqual(idx.tolist(), expected_list) + + def test_minmax(self): + + # monotonic + idx1 = TimedeltaIndex(['nat', '1 days', '2 days', '3 days']) + self.assertTrue(idx1.is_monotonic) + + # non-monotonic + idx2 = TimedeltaIndex(['1 days', np.nan, '3 days', 'NaT']) + self.assertFalse(idx2.is_monotonic) + + for idx in [idx1, idx2]: + self.assertEqual(idx.min(), Timedelta('1 days')), + self.assertEqual(idx.max(), Timedelta('3 days')), + + for op in ['min', 'max']: + # Return NaT + obj = TimedeltaIndex([]) + self.assertTrue(pd.isnull(getattr(obj, op)())) + + obj = TimedeltaIndex([pd.NaT]) + self.assertTrue(pd.isnull(getattr(obj, op)())) + + obj = TimedeltaIndex([pd.NaT, pd.NaT, 
pd.NaT]) + self.assertTrue(pd.isnull(getattr(obj, op)())) + + def test_representation(self): + idx1 = TimedeltaIndex([], freq='D') + idx2 = TimedeltaIndex(['1 days'], freq='D') + idx3 = TimedeltaIndex(['1 days', '2 days'], freq='D') + idx4 = TimedeltaIndex(['1 days', '2 days', '3 days'], freq='D') + idx5 = TimedeltaIndex(['1 days 00:00:01', '2 days', '3 days']) + + + exp1 = """ +Length: 0, Freq: """ + exp2 = """ +['1 days'] +Length: 1, Freq: """ + exp3 = """ +['1 days', '2 days'] +Length: 2, Freq: """ + exp4 = """ +['1 days', ..., '3 days'] +Length: 3, Freq: """ + exp5 = """ +['1 days 00:00:01', ..., '3 days 00:00:00'] +Length: 3, Freq: None""" + + for idx, expected in zip([idx1, idx2, idx3, idx4, idx5], + [exp1, exp2, exp3, exp4, exp5]): + for func in ['__repr__', '__unicode__', '__str__']: + result = getattr(idx, func)() + self.assertEqual(result, expected) + + def test_add_iadd(self): + + # only test adding/sub offsets as + is now numeric + + # offset + offsets = [pd.offsets.Hour(2), timedelta(hours=2), + np.timedelta64(2, 'h'), Timedelta(hours=2)] + + for delta in offsets: + rng = timedelta_range('1 days','10 days') + result = rng + delta + expected = timedelta_range('1 days 02:00:00','10 days 02:00:00',freq='D') + tm.assert_index_equal(result, expected) + rng += delta + tm.assert_index_equal(rng, expected) + + # int + rng = timedelta_range('1 days 09:00:00', freq='H', periods=10) + result = rng + 1 + expected = timedelta_range('1 days 10:00:00', freq='H', periods=10) + tm.assert_index_equal(result, expected) + rng += 1 + tm.assert_index_equal(rng, expected) + + def test_sub_isub(self): + + # only test adding/sub offsets as - is now numeric + + # offset + offsets = [pd.offsets.Hour(2), timedelta(hours=2), np.timedelta64(2, 'h'), + Timedelta(hours=2)] + + for delta in offsets: + rng = timedelta_range('1 days','10 days') + result = rng - delta + expected = timedelta_range('0 days 22:00:00', '9 days 22:00:00') + tm.assert_index_equal(result, expected) + rng -= delta + tm.assert_index_equal(rng, expected) + + # int + rng = timedelta_range('1 days 09:00:00', freq='H', periods=10) + result = rng - 1 + expected = timedelta_range('1 days 08:00:00', freq='H', periods=10) + tm.assert_index_equal(result, expected) + rng -= 1 + tm.assert_index_equal(rng, expected) + + def test_ops_compat(self): + + offsets = [pd.offsets.Hour(2), timedelta(hours=2), np.timedelta64(2, 'h'), + Timedelta(hours=2)] + + rng = timedelta_range('1 days','10 days',name='foo') + + # multiply + for offset in offsets: + self.assertRaises(TypeError, lambda : rng * offset) + + # divide + expected = Int64Index((np.arange(10)+1)*12,name='foo') + for offset in offsets: + result = rng / offset + tm.assert_index_equal(result,expected) + + # divide with nats + rng = TimedeltaIndex(['1 days',pd.NaT,'2 days'],name='foo') + expected = Float64Index([12,np.nan,24]) + for offset in offsets: + result = rng / offset + tm.assert_index_equal(result,expected) + + # don't allow division by NaT (make could in the future) + self.assertRaises(TypeError, lambda : rng / pd.NaT) + + def test_subtraction_ops(self): + + # with datetimes/timedelta and tdi/dti + tdi = TimedeltaIndex(['1 days',pd.NaT,'2 days'],name='foo') + dti = date_range('20130101',periods=3) + td = Timedelta('1 days') + dt = Timestamp('20130101') + + self.assertRaises(TypeError, lambda : tdi - dt) + self.assertRaises(TypeError, lambda : tdi - dti) + self.assertRaises(TypeError, lambda : td - dt) + self.assertRaises(TypeError, lambda : td - dti) + + result = dt-dti + expected = 
TimedeltaIndex(['0 days','-1 days','-2 days']) + tm.assert_index_equal(result,expected) + + result = dti-dt + expected = TimedeltaIndex(['0 days','1 days','2 days']) + tm.assert_index_equal(result,expected) + + result = tdi-td + expected = TimedeltaIndex(['0 days',pd.NaT,'1 days']) + tm.assert_index_equal(result,expected) + + result = td-tdi + expected = TimedeltaIndex(['0 days',pd.NaT,'-1 days']) + tm.assert_index_equal(result,expected) + + result = dti-td + expected = DatetimeIndex(['20121231','20130101','20130102']) + tm.assert_index_equal(result,expected) + + result = dt-tdi + expected = DatetimeIndex(['20121231',pd.NaT,'20121230']) + tm.assert_index_equal(result,expected) + + def test_dti_tdi_numeric_ops(self): + + # These are normally union/diff set-like ops + tdi = TimedeltaIndex(['1 days',pd.NaT,'2 days'],name='foo') + dti = date_range('20130101',periods=3) + td = Timedelta('1 days') + dt = Timestamp('20130101') + + result = tdi-tdi + expected = TimedeltaIndex(['0 days',pd.NaT,'0 days']) + tm.assert_index_equal(result,expected) + + result = tdi+tdi + expected = TimedeltaIndex(['2 days',pd.NaT,'4 days']) + tm.assert_index_equal(result,expected) + + result = dti-tdi + expected = DatetimeIndex(['20121231',pd.NaT,'20130101']) + tm.assert_index_equal(result,expected) + + def test_addition_ops(self): + + # with datetimes/timedelta and tdi/dti + tdi = TimedeltaIndex(['1 days',pd.NaT,'2 days'],name='foo') + dti = date_range('20130101',periods=3) + td = Timedelta('1 days') + dt = Timestamp('20130101') + + result = tdi + dt + expected = DatetimeIndex(['20130102',pd.NaT,'20130103']) + tm.assert_index_equal(result,expected) + + result = dt + tdi + expected = DatetimeIndex(['20130102',pd.NaT,'20130103']) + tm.assert_index_equal(result,expected) + + result = td + tdi + expected = TimedeltaIndex(['2 days',pd.NaT,'3 days']) + tm.assert_index_equal(result,expected) + + result = tdi + td + expected = TimedeltaIndex(['2 days',pd.NaT,'3 days']) + tm.assert_index_equal(result,expected) + + # unequal length + self.assertRaises(ValueError, lambda : tdi + dti[0:1]) + self.assertRaises(ValueError, lambda : tdi[0:1] + dti) + + # random indexes + self.assertRaises(TypeError, lambda : tdi + Int64Index([1,2,3])) + + # this is a union! 
+ #self.assertRaises(TypeError, lambda : Int64Index([1,2,3]) + tdi) + + result = tdi + dti + expected = DatetimeIndex(['20130102',pd.NaT,'20130105']) + tm.assert_index_equal(result,expected) + + result = dti + tdi + expected = DatetimeIndex(['20130102',pd.NaT,'20130105']) + tm.assert_index_equal(result,expected) + + result = dt + td + expected = Timestamp('20130102') + self.assertEqual(result,expected) + + result = td + dt + expected = Timestamp('20130102') + self.assertEqual(result,expected) + + def test_value_counts_unique(self): + # GH 7735 + + idx = timedelta_range('1 days 09:00:00', freq='H', periods=10) + # create repeated values, 'n'th element is repeated by n+1 times + idx = TimedeltaIndex(np.repeat(idx.values, range(1, len(idx) + 1))) + + exp_idx = timedelta_range('1 days 18:00:00', freq='-1H', periods=10) + expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64') + tm.assert_series_equal(idx.value_counts(), expected) + + expected = timedelta_range('1 days 09:00:00', freq='H', periods=10) + tm.assert_index_equal(idx.unique(), expected) + + idx = TimedeltaIndex(['1 days 09:00:00', '1 days 09:00:00', '1 days 09:00:00', + '1 days 08:00:00', '1 days 08:00:00', pd.NaT]) + + exp_idx = TimedeltaIndex(['1 days 09:00:00', '1 days 08:00:00']) + expected = Series([3, 2], index=exp_idx) + tm.assert_series_equal(idx.value_counts(), expected) + + exp_idx = TimedeltaIndex(['1 days 09:00:00', '1 days 08:00:00', pd.NaT]) + expected = Series([3, 2, 1], index=exp_idx) + tm.assert_series_equal(idx.value_counts(dropna=False), expected) + + tm.assert_index_equal(idx.unique(), exp_idx) + +class TestPeriodIndexOps(Ops): + + def setUp(self): + super(TestPeriodIndexOps, self).setUp() + mask = lambda x: isinstance(x, DatetimeIndex) or isinstance(x, PeriodIndex) + self.is_valid_objs = [ o for o in self.objs if mask(o) ] + self.not_valid_objs = [ o for o in self.objs if not mask(o) ] + + def test_ops_properties(self): + self.check_ops_properties(['year','month','day','hour','minute','second','weekofyear','week','dayofweek','dayofyear','quarter']) + self.check_ops_properties(['qyear'], lambda x: isinstance(x,PeriodIndex)) + + def test_asobject_tolist(self): + idx = pd.period_range(start='2013-01-01', periods=4, freq='M', name='idx') + expected_list = [pd.Period('2013-01-31', freq='M'), pd.Period('2013-02-28', freq='M'), + pd.Period('2013-03-31', freq='M'), pd.Period('2013-04-30', freq='M')] + expected = pd.Index(expected_list, dtype=object, name='idx') + result = idx.asobject + self.assertTrue(isinstance(result, Index)) + self.assertEqual(result.dtype, object) + self.assertTrue(result.equals(expected)) + self.assertEqual(result.name, expected.name) + self.assertEqual(idx.tolist(), expected_list) + + idx = PeriodIndex(['2013-01-01', '2013-01-02', 'NaT', '2013-01-04'], freq='D', name='idx') + expected_list = [pd.Period('2013-01-01', freq='D'), pd.Period('2013-01-02', freq='D'), + pd.Period('NaT', freq='D'), pd.Period('2013-01-04', freq='D')] + expected = pd.Index(expected_list, dtype=object, name='idx') + result = idx.asobject + self.assertTrue(isinstance(result, Index)) + self.assertEqual(result.dtype, object) + for i in [0, 1, 3]: + self.assertTrue(result[i], expected[i]) + self.assertTrue(result[2].ordinal, pd.tslib.iNaT) + self.assertTrue(result[2].freq, 'D') + self.assertEqual(result.name, expected.name) + + result_list = idx.tolist() + for i in [0, 1, 3]: + self.assertTrue(result_list[i], expected_list[i]) + self.assertTrue(result_list[2].ordinal, pd.tslib.iNaT) + 
self.assertTrue(result_list[2].freq, 'D') + + def test_minmax(self): + + # monotonic + idx1 = pd.PeriodIndex([pd.NaT, '2011-01-01', '2011-01-02', + '2011-01-03'], freq='D') + self.assertTrue(idx1.is_monotonic) + + # non-monotonic + idx2 = pd.PeriodIndex(['2011-01-01', pd.NaT, '2011-01-03', + '2011-01-02', pd.NaT], freq='D') + self.assertFalse(idx2.is_monotonic) + + for idx in [idx1, idx2]: + self.assertEqual(idx.min(), pd.Period('2011-01-01', freq='D')) + self.assertEqual(idx.max(), pd.Period('2011-01-03', freq='D')) + + for op in ['min', 'max']: + # Return NaT + obj = PeriodIndex([], freq='M') + result = getattr(obj, op)() + self.assertEqual(result.ordinal, tslib.iNaT) + self.assertEqual(result.freq, 'M') + + obj = PeriodIndex([pd.NaT], freq='M') + result = getattr(obj, op)() + self.assertEqual(result.ordinal, tslib.iNaT) + self.assertEqual(result.freq, 'M') + + obj = PeriodIndex([pd.NaT, pd.NaT, pd.NaT], freq='M') + result = getattr(obj, op)() + self.assertEqual(result.ordinal, tslib.iNaT) + self.assertEqual(result.freq, 'M') + + def test_representation(self): + # GH 7601 + idx1 = PeriodIndex([], freq='D') + idx2 = PeriodIndex(['2011-01-01'], freq='D') + idx3 = PeriodIndex(['2011-01-01', '2011-01-02'], freq='D') + idx4 = PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], freq='D') + idx5 = PeriodIndex(['2011', '2012', '2013'], freq='A') + idx6 = PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'], freq='H') + + idx7 = pd.period_range('2013Q1', periods=1, freq="Q") + idx8 = pd.period_range('2013Q1', periods=2, freq="Q") + idx9 = pd.period_range('2013Q1', periods=3, freq="Q") + + exp1 = """ +Length: 0, Freq: D""" + exp2 = """ +[2011-01-01] +Length: 1, Freq: D""" + exp3 = """ +[2011-01-01, 2011-01-02] +Length: 2, Freq: D""" + exp4 = """ +[2011-01-01, ..., 2011-01-03] +Length: 3, Freq: D""" + exp5 = """ +[2011, ..., 2013] +Length: 3, Freq: A-DEC""" + exp6 = """ +[2011-01-01 09:00, ..., NaT] +Length: 3, Freq: H""" + exp7 = """ +[2013Q1] +Length: 1, Freq: Q-DEC""" + exp8 = """ +[2013Q1, 2013Q2] +Length: 2, Freq: Q-DEC""" + exp9 = """ +[2013Q1, ..., 2013Q3] +Length: 3, Freq: Q-DEC""" + + for idx, expected in zip([idx1, idx2, idx3, idx4, idx5, idx6, idx7, idx8, idx9], + [exp1, exp2, exp3, exp4, exp5, exp6, exp7, exp8, exp9]): + for func in ['__repr__', '__unicode__', '__str__']: + result = getattr(idx, func)() + self.assertEqual(result, expected) + + def test_resolution(self): + for freq, expected in zip(['A', 'Q', 'M', 'D', 'H', 'T', 'S', 'L', 'U'], + ['day', 'day', 'day', 'day', + 'hour', 'minute', 'second', 'millisecond', 'microsecond']): + + idx = pd.period_range(start='2013-04-01', periods=30, freq=freq) + self.assertEqual(idx.resolution, expected) + + def test_add_iadd(self): + # union + rng1 = pd.period_range('1/1/2000', freq='D', periods=5) + other1 = pd.period_range('1/6/2000', freq='D', periods=5) + expected1 = pd.period_range('1/1/2000', freq='D', periods=10) + + rng2 = pd.period_range('1/1/2000', freq='D', periods=5) + other2 = pd.period_range('1/4/2000', freq='D', periods=5) + expected2 = pd.period_range('1/1/2000', freq='D', periods=8) + + rng3 = pd.period_range('1/1/2000', freq='D', periods=5) + other3 = pd.PeriodIndex([], freq='D') + expected3 = pd.period_range('1/1/2000', freq='D', periods=5) + + rng4 = pd.period_range('2000-01-01 09:00', freq='H', periods=5) + other4 = pd.period_range('2000-01-02 09:00', freq='H', periods=5) + expected4 = pd.PeriodIndex(['2000-01-01 09:00', '2000-01-01 10:00', + '2000-01-01 11:00', '2000-01-01 12:00', + '2000-01-01 13:00', 
'2000-01-02 09:00', + '2000-01-02 10:00', '2000-01-02 11:00', + '2000-01-02 12:00', '2000-01-02 13:00'], + freq='H') + + rng5 = pd.PeriodIndex(['2000-01-01 09:01', '2000-01-01 09:03', + '2000-01-01 09:05'], freq='T') + other5 = pd.PeriodIndex(['2000-01-01 09:01', '2000-01-01 09:05' + '2000-01-01 09:08'], freq='T') + expected5 = pd.PeriodIndex(['2000-01-01 09:01', '2000-01-01 09:03', + '2000-01-01 09:05', '2000-01-01 09:08'], + freq='T') + + rng6 = pd.period_range('2000-01-01', freq='M', periods=7) + other6 = pd.period_range('2000-04-01', freq='M', periods=7) + expected6 = pd.period_range('2000-01-01', freq='M', periods=10) + + rng7 = pd.period_range('2003-01-01', freq='A', periods=5) + other7 = pd.period_range('1998-01-01', freq='A', periods=8) + expected7 = pd.period_range('1998-01-01', freq='A', periods=10) + + for rng, other, expected in [(rng1, other1, expected1), (rng2, other2, expected2), + (rng3, other3, expected3), (rng4, other4, expected4), + (rng5, other5, expected5), (rng6, other6, expected6), + (rng7, other7, expected7)]: + + result_add = rng + other + result_union = rng.union(other) + + tm.assert_index_equal(result_add, expected) + tm.assert_index_equal(result_union, expected) + # GH 6527 + rng += other + tm.assert_index_equal(rng, expected) + + # offset + # DateOffset + rng = pd.period_range('2014', '2024', freq='A') + result = rng + pd.offsets.YearEnd(5) + expected = pd.period_range('2019', '2029', freq='A') + tm.assert_index_equal(result, expected) + rng += pd.offsets.YearEnd(5) + tm.assert_index_equal(rng, expected) + + for o in [pd.offsets.YearBegin(2), pd.offsets.MonthBegin(1), pd.offsets.Minute(), + np.timedelta64(365, 'D'), timedelta(365), Timedelta(days=365)]: + with tm.assertRaisesRegexp(ValueError, 'Input has different freq from Period'): + rng + o + + rng = pd.period_range('2014-01', '2016-12', freq='M') + result = rng + pd.offsets.MonthEnd(5) + expected = pd.period_range('2014-06', '2017-05', freq='M') + tm.assert_index_equal(result, expected) + rng += pd.offsets.MonthEnd(5) + tm.assert_index_equal(rng, expected) + + for o in [pd.offsets.YearBegin(2), pd.offsets.MonthBegin(1), pd.offsets.Minute(), + np.timedelta64(365, 'D'), timedelta(365), Timedelta(days=365)]: + rng = pd.period_range('2014-01', '2016-12', freq='M') + with tm.assertRaisesRegexp(ValueError, 'Input has different freq from Period'): + rng + o + + # Tick + offsets = [pd.offsets.Day(3), timedelta(days=3), np.timedelta64(3, 'D'), + pd.offsets.Hour(72), timedelta(minutes=60*24*3), + np.timedelta64(72, 'h'), Timedelta('72:00:00')] + for delta in offsets: + rng = pd.period_range('2014-05-01', '2014-05-15', freq='D') + result = rng + delta + expected = pd.period_range('2014-05-04', '2014-05-18', freq='D') + tm.assert_index_equal(result, expected) + rng += delta + tm.assert_index_equal(rng, expected) + + for o in [pd.offsets.YearBegin(2), pd.offsets.MonthBegin(1), pd.offsets.Minute(), + np.timedelta64(4, 'h'), timedelta(hours=23), Timedelta('23:00:00')]: + rng = pd.period_range('2014-05-01', '2014-05-15', freq='D') + with tm.assertRaisesRegexp(ValueError, 'Input has different freq from Period'): + rng + o + + offsets = [pd.offsets.Hour(2), timedelta(hours=2), np.timedelta64(2, 'h'), + pd.offsets.Minute(120), timedelta(minutes=120), + np.timedelta64(120, 'm'), Timedelta(minutes=120)] + for delta in offsets: + rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', freq='H') + result = rng + delta + expected = pd.period_range('2014-01-01 12:00', '2014-01-05 12:00', freq='H') + 
tm.assert_index_equal(result, expected) + rng += delta + tm.assert_index_equal(rng, expected) + + for delta in [pd.offsets.YearBegin(2), timedelta(minutes=30), + np.timedelta64(30, 's'), Timedelta(seconds=30)]: + rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', freq='H') + with tm.assertRaisesRegexp(ValueError, 'Input has different freq from Period'): + result = rng + delta + with tm.assertRaisesRegexp(ValueError, 'Input has different freq from Period'): + rng += delta + + # int + rng = pd.period_range('2000-01-01 09:00', freq='H', periods=10) + result = rng + 1 + expected = pd.period_range('2000-01-01 10:00', freq='H', periods=10) + tm.assert_index_equal(result, expected) + rng += 1 + tm.assert_index_equal(rng, expected) + + def test_sub_isub(self): + # diff + rng1 = pd.period_range('1/1/2000', freq='D', periods=5) + other1 = pd.period_range('1/6/2000', freq='D', periods=5) + expected1 = pd.period_range('1/1/2000', freq='D', periods=5) + + rng2 = pd.period_range('1/1/2000', freq='D', periods=5) + other2 = pd.period_range('1/4/2000', freq='D', periods=5) + expected2 = pd.period_range('1/1/2000', freq='D', periods=3) + + rng3 = pd.period_range('1/1/2000', freq='D', periods=5) + other3 = pd.PeriodIndex([], freq='D') + expected3 = pd.period_range('1/1/2000', freq='D', periods=5) + + rng4 = pd.period_range('2000-01-01 09:00', freq='H', periods=5) + other4 = pd.period_range('2000-01-02 09:00', freq='H', periods=5) + expected4 = rng4 + + rng5 = pd.PeriodIndex(['2000-01-01 09:01', '2000-01-01 09:03', + '2000-01-01 09:05'], freq='T') + other5 = pd.PeriodIndex(['2000-01-01 09:01', '2000-01-01 09:05'], freq='T') + expected5 = pd.PeriodIndex(['2000-01-01 09:03'], freq='T') + + rng6 = pd.period_range('2000-01-01', freq='M', periods=7) + other6 = pd.period_range('2000-04-01', freq='M', periods=7) + expected6 = pd.period_range('2000-01-01', freq='M', periods=3) + + rng7 = pd.period_range('2003-01-01', freq='A', periods=5) + other7 = pd.period_range('1998-01-01', freq='A', periods=8) + expected7 = pd.period_range('2006-01-01', freq='A', periods=2) + + for rng, other, expected in [(rng1, other1, expected1), (rng2, other2, expected2), + (rng3, other3, expected3), (rng4, other4, expected4), + (rng5, other5, expected5), (rng6, other6, expected6), + (rng7, other7, expected7),]: + result_union = rng.difference(other) + tm.assert_index_equal(result_union, expected) + + # offset + # DateOffset + rng = pd.period_range('2014', '2024', freq='A') + result = rng - pd.offsets.YearEnd(5) + expected = pd.period_range('2009', '2019', freq='A') + tm.assert_index_equal(result, expected) + rng -= pd.offsets.YearEnd(5) + tm.assert_index_equal(rng, expected) + + for o in [pd.offsets.YearBegin(2), pd.offsets.MonthBegin(1), pd.offsets.Minute(), + np.timedelta64(365, 'D'), timedelta(365)]: + rng = pd.period_range('2014', '2024', freq='A') + with tm.assertRaisesRegexp(ValueError, 'Input has different freq from Period'): + rng - o + + rng = pd.period_range('2014-01', '2016-12', freq='M') + result = rng - pd.offsets.MonthEnd(5) + expected = pd.period_range('2013-08', '2016-07', freq='M') + tm.assert_index_equal(result, expected) + rng -= pd.offsets.MonthEnd(5) + tm.assert_index_equal(rng, expected) + + for o in [pd.offsets.YearBegin(2), pd.offsets.MonthBegin(1), pd.offsets.Minute(), + np.timedelta64(365, 'D'), timedelta(365)]: + rng = pd.period_range('2014-01', '2016-12', freq='M') + with tm.assertRaisesRegexp(ValueError, 'Input has different freq from Period'): + rng - o + + # Tick + offsets = [pd.offsets.Day(3), 
timedelta(days=3), np.timedelta64(3, 'D'), + pd.offsets.Hour(72), timedelta(minutes=60*24*3), np.timedelta64(72, 'h')] + for delta in offsets: + rng = pd.period_range('2014-05-01', '2014-05-15', freq='D') + result = rng - delta + expected = pd.period_range('2014-04-28', '2014-05-12', freq='D') + tm.assert_index_equal(result, expected) + rng -= delta + tm.assert_index_equal(rng, expected) + + for o in [pd.offsets.YearBegin(2), pd.offsets.MonthBegin(1), pd.offsets.Minute(), + np.timedelta64(4, 'h'), timedelta(hours=23)]: + rng = pd.period_range('2014-05-01', '2014-05-15', freq='D') + with tm.assertRaisesRegexp(ValueError, 'Input has different freq from Period'): + rng - o + + offsets = [pd.offsets.Hour(2), timedelta(hours=2), np.timedelta64(2, 'h'), + pd.offsets.Minute(120), timedelta(minutes=120), np.timedelta64(120, 'm')] + for delta in offsets: + rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', freq='H') + result = rng - delta + expected = pd.period_range('2014-01-01 08:00', '2014-01-05 08:00', freq='H') + tm.assert_index_equal(result, expected) + rng -= delta + tm.assert_index_equal(rng, expected) + + for delta in [pd.offsets.YearBegin(2), timedelta(minutes=30), np.timedelta64(30, 's')]: + rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', freq='H') + with tm.assertRaisesRegexp(ValueError, 'Input has different freq from Period'): + result = rng + delta + with tm.assertRaisesRegexp(ValueError, 'Input has different freq from Period'): + rng += delta + + # int + rng = pd.period_range('2000-01-01 09:00', freq='H', periods=10) + result = rng - 1 + expected = pd.period_range('2000-01-01 08:00', freq='H', periods=10) + tm.assert_index_equal(result, expected) + rng -= 1 + tm.assert_index_equal(rng, expected) + + def test_value_counts_unique(self): + # GH 7735 + idx = pd.period_range('2011-01-01 09:00', freq='H', periods=10) + # create repeated values, 'n'th element is repeated by n+1 times + idx = PeriodIndex(np.repeat(idx.values, range(1, len(idx) + 1)), freq='H') + + exp_idx = PeriodIndex(['2011-01-01 18:00', '2011-01-01 17:00', '2011-01-01 16:00', + '2011-01-01 15:00', '2011-01-01 14:00', '2011-01-01 13:00', + '2011-01-01 12:00', '2011-01-01 11:00', '2011-01-01 10:00', + '2011-01-01 09:00'], freq='H') + expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64') + tm.assert_series_equal(idx.value_counts(), expected) + + expected = pd.period_range('2011-01-01 09:00', freq='H', periods=10) + tm.assert_index_equal(idx.unique(), expected) + + idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 09:00', '2013-01-01 09:00', + '2013-01-01 08:00', '2013-01-01 08:00', pd.NaT], freq='H') + + exp_idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 08:00'], freq='H') + expected = Series([3, 2], index=exp_idx) + tm.assert_series_equal(idx.value_counts(), expected) + + exp_idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 08:00', pd.NaT], freq='H') + expected = Series([3, 2, 1], index=exp_idx) + tm.assert_series_equal(idx.value_counts(dropna=False), expected) + + tm.assert_index_equal(idx.unique(), exp_idx) + + +if __name__ == '__main__': + import nose + + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + # '--with-coverage', '--cover-package=pandas.core'], + exit=False) diff --git a/pandas/tseries/tests/test_converter.py b/pandas/tseries/tests/test_converter.py index a1b873e1c0bea..d3287a01cd1da 100644 --- a/pandas/tseries/tests/test_converter.py +++ b/pandas/tseries/tests/test_converter.py @@ -74,7 +74,7 @@ def test_dateindex_conversion(self): for freq 
in ('B', 'L', 'S'): dateindex = tm.makeDateIndex(k = 10, freq = freq) rs = self.dtc.convert(dateindex, None, None) - xp = converter.dates.date2num(dateindex) + xp = converter.dates.date2num(dateindex._mpl_repr()) np_assert_almost_equal(rs, xp, decimals) def test_resolution(self): diff --git a/pandas/tseries/tests/test_holiday.py b/pandas/tseries/tests/test_holiday.py index adc2c0d237265..c2300481eca43 100644 --- a/pandas/tseries/tests/test_holiday.py +++ b/pandas/tseries/tests/test_holiday.py @@ -72,6 +72,22 @@ def test_usmemorialday(self): ] self.assertEqual(list(holidays), holidayList) + def test_non_observed_holiday(self): + july_3rd = Holiday('July 4th Eve', month=7, day=3) + result = july_3rd.dates("2001-01-01", "2003-03-03") + expected = [Timestamp('2001-07-03 00:00:00'), + Timestamp('2002-07-03 00:00:00')] + self.assertEqual(list(result), expected) + july_3rd = Holiday('July 4th Eve', month=7, day=3, + days_of_week=(0, 1, 2, 3)) + result = july_3rd.dates("2001-01-01", "2008-03-03") + expected = [Timestamp('2001-07-03 00:00:00'), + Timestamp('2002-07-03 00:00:00'), + Timestamp('2003-07-03 00:00:00'), + Timestamp('2006-07-03 00:00:00'), + Timestamp('2007-07-03 00:00:00')] + self.assertEqual(list(result), expected) + def test_easter(self): holidays = EasterMonday.dates(self.start_date, self.end_date) diff --git a/pandas/tseries/tests/test_offsets.py b/pandas/tseries/tests/test_offsets.py index f6f91760e8ad8..3b2e8f203c313 100644 --- a/pandas/tseries/tests/test_offsets.py +++ b/pandas/tseries/tests/test_offsets.py @@ -1,3 +1,4 @@ +import os from datetime import date, datetime, timedelta from dateutil.relativedelta import relativedelta from pandas.compat import range @@ -22,6 +23,7 @@ from pandas.tseries.tools import parse_time_string import pandas.tseries.offsets as offsets +from pandas.io.pickle import read_pickle from pandas.tslib import NaT, Timestamp import pandas.tslib as tslib from pandas.util.testing import assertRaisesRegexp @@ -848,6 +850,24 @@ def test_calendar(self): dt = datetime(2014, 1, 17) assertEq(CDay(calendar=calendar), dt, datetime(2014, 1, 21)) + def test_roundtrip_pickle(self): + def _check_roundtrip(obj): + unpickled = self.round_trip_pickle(obj) + self.assertEqual(unpickled, obj) + _check_roundtrip(self.offset) + _check_roundtrip(self.offset2) + _check_roundtrip(self.offset*2) + + def test_pickle_compat_0_14_1(self): + hdays = [datetime(2013,1,1) for ele in range(4)] + + pth = tm.get_data_path() + + cday0_14_1 = read_pickle(os.path.join(pth, 'cday-0.14.1.pickle')) + cday = CDay(holidays=hdays) + self.assertEqual(cday, cday0_14_1) + + class CustomBusinessMonthBase(object): _multiprocess_can_split_ = True @@ -894,6 +914,15 @@ def test_offsets_compare_equal(self): offset2 = self._object() self.assertFalse(offset1 != offset2) + def test_roundtrip_pickle(self): + def _check_roundtrip(obj): + unpickled = self.round_trip_pickle(obj) + self.assertEqual(unpickled, obj) + _check_roundtrip(self._object()) + _check_roundtrip(self._object(2)) + _check_roundtrip(self._object()*2) + + class TestCustomBusinessMonthEnd(CustomBusinessMonthBase, Base): _object = CBMonthEnd @@ -1006,8 +1035,12 @@ def test_holidays(self): def test_datetimeindex(self): from pandas.tseries.holiday import USFederalHolidayCalendar - self.assertEqual(DatetimeIndex(start='20120101',end='20130101',freq=CBMonthEnd(calendar=USFederalHolidayCalendar())).tolist()[0], - datetime(2012,1,31)) + hcal = USFederalHolidayCalendar() + freq = CBMonthEnd(calendar=hcal) + + 
self.assertEqual(DatetimeIndex(start='20120101',end='20130101', + freq=freq).tolist()[0], + datetime(2012,1,31)) class TestCustomBusinessMonthBegin(CustomBusinessMonthBase, Base): _object = CBMonthBegin @@ -1120,8 +1153,11 @@ def test_holidays(self): self.assertEqual(dt + 2*bm_offset,datetime(2012,2,3)) def test_datetimeindex(self): - self.assertEqual(DatetimeIndex(start='20120101',end='20130101',freq=CBMonthBegin(calendar=USFederalHolidayCalendar())).tolist()[0], - datetime(2012,1,3)) + hcal = USFederalHolidayCalendar() + cbmb = CBMonthBegin(calendar=hcal) + self.assertEqual(DatetimeIndex(start='20120101', end='20130101', + freq=cbmb).tolist()[0], + datetime(2012,1,3)) def assertOnOffset(offset, date, expected): @@ -3104,6 +3140,134 @@ def test_str_for_named_is_name(self): self.assertEqual(str(offset), name) +def get_utc_offset_hours(ts): + # take a Timestamp and compute total hours of utc offset + o = ts.utcoffset() + return (o.days * 24 * 3600 + o.seconds) / 3600.0 + + +class TestDST(tm.TestCase): + """ + test DateOffset additions over Daylight Savings Time + """ + # one microsecond before the DST transition + ts_pre_fallback = "2013-11-03 01:59:59.999999" + ts_pre_springfwd = "2013-03-10 01:59:59.999999" + + # test both basic names and dateutil timezones + timezone_utc_offsets = { + 'US/Eastern': dict( + utc_offset_daylight=-4, + utc_offset_standard=-5, + ), + 'dateutil/US/Pacific': dict( + utc_offset_daylight=-7, + utc_offset_standard=-8, + ) + } + valid_date_offsets_singular = [ + 'weekday', 'day', 'hour', 'minute', 'second', 'microsecond' + ] + valid_date_offsets_plural = [ + 'weeks', 'days', + 'hours', 'minutes', 'seconds', + 'milliseconds', 'microseconds' + ] + + def _test_all_offsets(self, n, **kwds): + valid_offsets = self.valid_date_offsets_plural if n > 1 \ + else self.valid_date_offsets_singular + + for name in valid_offsets: + self._test_offset(offset_name=name, offset_n=n, **kwds) + + def _test_offset(self, offset_name, offset_n, tstart, expected_utc_offset): + offset = DateOffset(**{offset_name: offset_n}) + t = tstart + offset + if expected_utc_offset is not None: + self.assertTrue(get_utc_offset_hours(t) == expected_utc_offset) + + if offset_name == 'weeks': + # dates should match + self.assertTrue( + t.date() == + timedelta(days=7 * offset.kwds['weeks']) + tstart.date() + ) + # expect the same day of week, hour of day, minute, second, ... + self.assertTrue( + t.dayofweek == tstart.dayofweek and + t.hour == tstart.hour and + t.minute == tstart.minute and + t.second == tstart.second + ) + elif offset_name == 'days': + # dates should match + self.assertTrue(timedelta(offset.kwds['days']) + tstart.date() == t.date()) + # expect the same hour of day, minute, second, ... 
+ self.assertTrue( + t.hour == tstart.hour and + t.minute == tstart.minute and + t.second == tstart.second + ) + elif offset_name in self.valid_date_offsets_singular: + # expect the signular offset value to match between tstart and t + datepart_offset = getattr(t, offset_name if offset_name != 'weekday' else 'dayofweek') + self.assertTrue(datepart_offset == offset.kwds[offset_name]) + else: + # the offset should be the same as if it was done in UTC + self.assertTrue( + t == (tstart.tz_convert('UTC') + offset).tz_convert('US/Pacific') + ) + + def _make_timestamp(self, string, hrs_offset, tz): + offset_string = '{hrs:02d}00'.format(hrs=hrs_offset) if hrs_offset >= 0 else \ + '-{hrs:02d}00'.format(hrs=-1 * hrs_offset) + return Timestamp(string + offset_string).tz_convert(tz) + + def test_fallback_plural(self): + """test moving from daylight savings to standard time""" + for tz, utc_offsets in self.timezone_utc_offsets.items(): + hrs_pre = utc_offsets['utc_offset_daylight'] + hrs_post = utc_offsets['utc_offset_standard'] + self._test_all_offsets( + n=3, + tstart=self._make_timestamp(self.ts_pre_fallback, hrs_pre, tz), + expected_utc_offset=hrs_post + ) + + def test_springforward_plural(self): + """test moving from standard to daylight savings""" + for tz, utc_offsets in self.timezone_utc_offsets.items(): + hrs_pre = utc_offsets['utc_offset_standard'] + hrs_post = utc_offsets['utc_offset_daylight'] + self._test_all_offsets( + n=3, + tstart=self._make_timestamp(self.ts_pre_springfwd, hrs_pre, tz), + expected_utc_offset=hrs_post + ) + + def test_fallback_singular(self): + # in the case of signular offsets, we dont neccesarily know which utc offset + # the new Timestamp will wind up in (the tz for 1 month may be different from 1 second) + # so we don't specify an expected_utc_offset + for tz, utc_offsets in self.timezone_utc_offsets.items(): + hrs_pre = utc_offsets['utc_offset_standard'] + self._test_all_offsets( + n=1, + tstart=self._make_timestamp(self.ts_pre_fallback, hrs_pre, tz), + expected_utc_offset=None + ) + + def test_springforward_singular(self): + for tz, utc_offsets in self.timezone_utc_offsets.items(): + hrs_pre = utc_offsets['utc_offset_standard'] + self._test_all_offsets( + n=1, + tstart=self._make_timestamp(self.ts_pre_springfwd, hrs_pre, tz), + expected_utc_offset=None + ) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tseries/tests/test_plotting.py b/pandas/tseries/tests/test_plotting.py index 6b34ae0eb9384..4a60cdbedae4d 100644 --- a/pandas/tseries/tests/test_plotting.py +++ b/pandas/tseries/tests/test_plotting.py @@ -59,7 +59,7 @@ def test_frame_inferred(self): _check_plot_works(df.plot) # axes freq - idx = idx[0:40] + idx[45:99] + idx = idx[0:40].union(idx[45:99]) df2 = DataFrame(np.random.randn(len(idx), 3), index=idx) _check_plot_works(df2.plot) diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index f4a96f5defab0..bd6c1766cfd61 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -146,7 +146,7 @@ def test_resample_how_callables(self): data = np.arange(5, dtype=np.int64) ind = pd.DatetimeIndex(start='2014-01-01', periods=len(data), freq='d') df = pd.DataFrame({"A": data, "B": data}, index=ind) - + def fn(x, a=1): return str(type(x)) @@ -164,7 +164,66 @@ def __call__(self, x): assert_frame_equal(df_standard, df_partial) assert_frame_equal(df_standard, df_partial2) assert_frame_equal(df_standard, 
df_class) - + + def test_resample_with_timedeltas(self): + + expected = DataFrame({'A' : np.arange(1480)}) + expected = expected.groupby(expected.index // 30).sum() + expected.index = pd.timedelta_range('0 days',freq='30T',periods=50) + + df = DataFrame({'A' : np.arange(1480)},index=pd.to_timedelta(np.arange(1480),unit='T')) + result = df.resample('30T',how='sum') + + assert_frame_equal(result, expected) + + def test_resample_rounding(self): + # GH 8371 + # odd results when rounding is needed + + data = """date,time,value +11-08-2014,00:00:01.093,1 +11-08-2014,00:00:02.159,1 +11-08-2014,00:00:02.667,1 +11-08-2014,00:00:03.175,1 +11-08-2014,00:00:07.058,1 +11-08-2014,00:00:07.362,1 +11-08-2014,00:00:08.324,1 +11-08-2014,00:00:08.830,1 +11-08-2014,00:00:08.982,1 +11-08-2014,00:00:09.815,1 +11-08-2014,00:00:10.540,1 +11-08-2014,00:00:11.061,1 +11-08-2014,00:00:11.617,1 +11-08-2014,00:00:13.607,1 +11-08-2014,00:00:14.535,1 +11-08-2014,00:00:15.525,1 +11-08-2014,00:00:17.960,1 +11-08-2014,00:00:20.674,1 +11-08-2014,00:00:21.191,1""" + + from pandas.compat import StringIO + df = pd.read_csv(StringIO(data), parse_dates={'timestamp': ['date', 'time']}, index_col='timestamp') + df.index.name = None + result = df.resample('6s', how='sum') + expected = DataFrame({'value' : [4,9,4,2]},index=date_range('2014-11-08',freq='6s',periods=4)) + assert_frame_equal(result,expected) + + result = df.resample('7s', how='sum') + expected = DataFrame({'value' : [4,10,4,1]},index=date_range('2014-11-08',freq='7s',periods=4)) + assert_frame_equal(result,expected) + + result = df.resample('11s', how='sum') + expected = DataFrame({'value' : [11,8]},index=date_range('2014-11-08',freq='11s',periods=2)) + assert_frame_equal(result,expected) + + result = df.resample('13s', how='sum') + expected = DataFrame({'value' : [13,6]},index=date_range('2014-11-08',freq='13s',periods=2)) + assert_frame_equal(result,expected) + + result = df.resample('17s', how='sum') + expected = DataFrame({'value' : [16,3]},index=date_range('2014-11-08',freq='17s',periods=2)) + assert_frame_equal(result,expected) + def test_resample_basic_from_daily(self): # from daily dti = DatetimeIndex( diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index 769062f293cf9..282301499dcbc 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -1,22 +1,29 @@ # pylint: disable-msg=E1101,W0612 +from __future__ import division from datetime import datetime, timedelta import nose import numpy as np import pandas as pd -from pandas import (Index, Series, DataFrame, Timestamp, isnull, notnull, - bdate_range, date_range) +from pandas import (Index, Series, DataFrame, Timestamp, Timedelta, TimedeltaIndex, isnull, notnull, + bdate_range, date_range, timedelta_range, Int64Index) import pandas.core.common as com -from pandas.compat import StringIO, lrange, range, zip, u, OrderedDict, long +from pandas.compat import StringIO, lrange, range, zip, u, OrderedDict, long, PY3_2 from pandas import compat, to_timedelta, tslib from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type as ct from pandas.util.testing import (assert_series_equal, assert_frame_equal, assert_almost_equal, + assert_index_equal, ensure_clean) +from pandas.tseries.offsets import Day, Second, Hour import pandas.util.testing as tm +from numpy.random import rand, randn +from pandas import _np_version_under1p8 + +iNaT = tslib.iNaT class TestTimedeltas(tm.TestCase): _multiprocess_can_split_ = True @@ -24,6 
+31,244 @@ class TestTimedeltas(tm.TestCase): def setUp(self): pass + def test_construction(self): + + expected = np.timedelta64(10,'D').astype('m8[ns]').view('i8') + self.assertEqual(Timedelta(10,unit='d').value, expected) + self.assertEqual(Timedelta(10.0,unit='d').value, expected) + self.assertEqual(Timedelta('10 days').value, expected) + self.assertEqual(Timedelta(days=10).value, expected) + + expected += np.timedelta64(10,'s').astype('m8[ns]').view('i8') + self.assertEqual(Timedelta('10 days 00:00:10').value, expected) + self.assertEqual(Timedelta(days=10,seconds=10).value, expected) + self.assertEqual(Timedelta(days=10,milliseconds=10*1000).value, expected) + self.assertEqual(Timedelta(days=10,microseconds=10*1000*1000).value, expected) + + # rounding cases + self.assertEqual(Timedelta(82739999850000).value, 82739999850000) + self.assertTrue('0 days 22:58:59.999850' in str(Timedelta(82739999850000))) + self.assertEqual(Timedelta(123072001000000).value, 123072001000000) + self.assertTrue('1 days 10:11:12.001' in str(Timedelta(123072001000000))) + + # more strings + # GH 8190 + self.assertEqual(Timedelta('1 h'), timedelta(hours=1)) + self.assertEqual(Timedelta('1 hour'), timedelta(hours=1)) + self.assertEqual(Timedelta('1 hours'), timedelta(hours=1)) + self.assertEqual(Timedelta('-1 hours'), -timedelta(hours=1)) + self.assertEqual(Timedelta('1 m'), timedelta(minutes=1)) + self.assertEqual(Timedelta('1.5 m'), timedelta(seconds=90)) + self.assertEqual(Timedelta('1 minute'), timedelta(minutes=1)) + self.assertEqual(Timedelta('1 minutes'), timedelta(minutes=1)) + self.assertEqual(Timedelta('1 s'), timedelta(seconds=1)) + self.assertEqual(Timedelta('1 second'), timedelta(seconds=1)) + self.assertEqual(Timedelta('1 seconds'), timedelta(seconds=1)) + self.assertEqual(Timedelta('1 ms'), timedelta(milliseconds=1)) + self.assertEqual(Timedelta('1 milli'), timedelta(milliseconds=1)) + self.assertEqual(Timedelta('1 millisecond'), timedelta(milliseconds=1)) + self.assertEqual(Timedelta('1 us'), timedelta(microseconds=1)) + self.assertEqual(Timedelta('1 micros'), timedelta(microseconds=1)) + self.assertEqual(Timedelta('1 microsecond'), timedelta(microseconds=1)) + self.assertEqual(Timedelta('1.5 microsecond'), Timedelta('00:00:00.000001500')) + self.assertEqual(Timedelta('1 ns'), Timedelta('00:00:00.000000001')) + self.assertEqual(Timedelta('1 nano'), Timedelta('00:00:00.000000001')) + self.assertEqual(Timedelta('1 nanosecond'), Timedelta('00:00:00.000000001')) + + # combos + self.assertEqual(Timedelta('10 days 1 hour'), timedelta(days=10,hours=1)) + self.assertEqual(Timedelta('10 days 1 h'), timedelta(days=10,hours=1)) + self.assertEqual(Timedelta('10 days 1 h 1m 1s'), timedelta(days=10,hours=1,minutes=1,seconds=1)) + self.assertEqual(Timedelta('-10 days 1 h 1m 1s'), -timedelta(days=10,hours=1,minutes=1,seconds=1)) + self.assertEqual(Timedelta('-10 days 1 h 1m 1s'), -timedelta(days=10,hours=1,minutes=1,seconds=1)) + self.assertEqual(Timedelta('-10 days 1 h 1m 1s 3us'), -timedelta(days=10,hours=1,minutes=1,seconds=1,microseconds=3)) + self.assertEqual(Timedelta('-10 days 1 h 1.5m 1s 3us'), -timedelta(days=10,hours=1,minutes=1,seconds=31,microseconds=3)) + + # currently invalid as it has a - on the hhmmdd part (only allowed on the days) + self.assertRaises(ValueError, lambda : Timedelta('-10 days -1 h 1.5m 1s 3us')) + + # roundtripping both for string and value + for v in ['1s', + '-1s', + '1us', + '-1us', + '1 day', + '-1 day', + '-23:59:59.999999', + '-1 days +23:59:59.999999', + '-1ns', + '1ns', + 
'-23:59:59.999999999']: + + td = Timedelta(v) + self.assertEqual(Timedelta(td.value),td) + + # str does not normally display nanos + if not td.nanoseconds: + self.assertEqual(Timedelta(str(td)),td) + self.assertEqual(Timedelta(td._repr_base(format='all')),td) + + # floats + expected = np.timedelta64(10,'s').astype('m8[ns]').view('i8') + np.timedelta64(500,'ms').astype('m8[ns]').view('i8') + self.assertEqual(Timedelta(10.5,unit='s').value, expected) + + # nat + self.assertEqual(Timedelta('').value,iNaT) + self.assertEqual(Timedelta('nat').value,iNaT) + self.assertEqual(Timedelta('NAT').value,iNaT) + self.assertTrue(isnull(Timestamp('nat'))) + self.assertTrue(isnull(Timedelta('nat'))) + + # offset + self.assertEqual(to_timedelta(pd.offsets.Hour(2)),Timedelta('0 days, 02:00:00')) + self.assertEqual(Timedelta(pd.offsets.Hour(2)),Timedelta('0 days, 02:00:00')) + self.assertEqual(Timedelta(pd.offsets.Second(2)),Timedelta('0 days, 00:00:02')) + + # invalid + tm.assertRaisesRegexp(ValueError, + "cannot construct a TimeDelta", + lambda : Timedelta()) + tm.assertRaisesRegexp(ValueError, + "cannot create timedelta string convert", + lambda : Timedelta('foo')) + tm.assertRaisesRegexp(ValueError, + "cannot construct a TimeDelta from the passed arguments, allowed keywords are ", + lambda : Timedelta(day=10)) + + def test_repr(self): + + self.assertEqual(repr(Timedelta(10,unit='d')),"Timedelta('10 days 00:00:00')") + self.assertEqual(repr(Timedelta(10,unit='s')),"Timedelta('0 days 00:00:10')") + self.assertEqual(repr(Timedelta(10,unit='ms')),"Timedelta('0 days 00:00:00.010000')") + self.assertEqual(repr(Timedelta(-10,unit='ms')),"Timedelta('-1 days +23:59:59.990000')") + + def test_identity(self): + + td = Timedelta(10,unit='d') + self.assertTrue(isinstance(td, Timedelta)) + self.assertTrue(isinstance(td, timedelta)) + + def test_conversion(self): + + for td in [ Timedelta(10,unit='d'), Timedelta('1 days, 10:11:12.012345') ]: + self.assertTrue(td == Timedelta(td.to_pytimedelta())) + self.assertEqual(td,td.to_pytimedelta()) + self.assertEqual(td,np.timedelta64(td.value,'ns')) + + # this is NOT equal and cannot be roundtriped (because of the nanos) + td = Timedelta('1 days, 10:11:12.012345678') + self.assertTrue(td != td.to_pytimedelta()) + + def test_ops(self): + + td = Timedelta(10,unit='d') + self.assertEqual(-td,Timedelta(-10,unit='d')) + self.assertEqual(+td,Timedelta(10,unit='d')) + self.assertEqual(td - td, Timedelta(0,unit='ns')) + self.assertTrue((td - pd.NaT) is pd.NaT) + self.assertEqual(td + td, Timedelta(20,unit='d')) + self.assertTrue((td + pd.NaT) is pd.NaT) + self.assertEqual(td * 2, Timedelta(20,unit='d')) + self.assertTrue((td * pd.NaT) is pd.NaT) + self.assertEqual(td / 2, Timedelta(5,unit='d')) + self.assertEqual(abs(td), td) + self.assertEqual(abs(-td), td) + self.assertEqual(td / td, 1) + self.assertTrue((td / pd.NaT) is pd.NaT) + + # invert + self.assertEqual(-td,Timedelta('-10d')) + self.assertEqual(td * -1,Timedelta('-10d')) + self.assertEqual(-1 * td,Timedelta('-10d')) + self.assertEqual(abs(-td),Timedelta('10d')) + + # invalid + self.assertRaises(TypeError, lambda : Timedelta(11,unit='d') // 2) + + # invalid multiply with another timedelta + self.assertRaises(TypeError, lambda : td * td) + + # can't operate with integers + self.assertRaises(TypeError, lambda : td + 2) + self.assertRaises(TypeError, lambda : td - 2) + + def test_freq_conversion(self): + + td = Timedelta('1 days 2 hours 3 ns') + result = td / np.timedelta64(1,'D') + self.assertEquals(result, 
td.value/float(86400*1e9)) + result = td / np.timedelta64(1,'s') + self.assertEquals(result, td.value/float(1e9)) + result = td / np.timedelta64(1,'ns') + self.assertEquals(result, td.value) + + def test_fields(self): + rng = to_timedelta('1 days, 10:11:12') + self.assertEqual(rng.days,1) + self.assertEqual(rng.hours,10) + self.assertEqual(rng.minutes,11) + self.assertEqual(rng.seconds,12) + self.assertEqual(rng.milliseconds,0) + self.assertEqual(rng.microseconds,0) + self.assertEqual(rng.nanoseconds,0) + + td = Timedelta('-1 days, 10:11:12') + self.assertEqual(abs(td),Timedelta('13:48:48')) + self.assertTrue(str(td) == "-1 days +10:11:12") + self.assertEqual(-td,Timedelta('0 days 13:48:48')) + self.assertEqual(-Timedelta('-1 days, 10:11:12').value,49728000000000) + self.assertEqual(Timedelta('-1 days, 10:11:12').value,-49728000000000) + + rng = to_timedelta('-1 days, 10:11:12') + self.assertEqual(rng.days,-1) + self.assertEqual(rng.hours,10) + self.assertEqual(rng.minutes,11) + self.assertEqual(rng.seconds,12) + self.assertEqual(rng.milliseconds,0) + self.assertEqual(rng.microseconds,0) + self.assertEqual(rng.nanoseconds,0) + + # components + tup = pd.to_timedelta(-1, 'us').components + self.assertEqual(tup.days,-1) + self.assertEqual(tup.hours,23) + self.assertEqual(tup.minutes,59) + self.assertEqual(tup.seconds,59) + self.assertEqual(tup.milliseconds,999) + self.assertEqual(tup.microseconds,999) + self.assertEqual(tup.nanoseconds,0) + + tup = Timedelta('-1 days 1 us').components + self.assertEqual(tup.days,-2) + self.assertEqual(tup.hours,23) + self.assertEqual(tup.minutes,59) + self.assertEqual(tup.seconds,59) + self.assertEqual(tup.milliseconds,999) + self.assertEqual(tup.microseconds,999) + self.assertEqual(tup.nanoseconds,0) + + def test_timedelta_range(self): + + expected = to_timedelta(np.arange(5),unit='D') + result = timedelta_range('0 days',periods=5,freq='D') + tm.assert_index_equal(result, expected) + + expected = to_timedelta(np.arange(11),unit='D') + result = timedelta_range('0 days','10 days',freq='D') + tm.assert_index_equal(result, expected) + + expected = to_timedelta(np.arange(5),unit='D') + Second(2) + Day() + result = timedelta_range('1 days, 00:00:02','5 days, 00:00:02',freq='D') + tm.assert_index_equal(result, expected) + + expected = to_timedelta([1,3,5,7,9],unit='D') + Second(2) + result = timedelta_range('1 days, 00:00:02',periods=5,freq='2D') + tm.assert_index_equal(result, expected) + + expected = to_timedelta(np.arange(50),unit='T')*30 + result = timedelta_range('0 days',freq='30T',periods=50) + tm.assert_index_equal(result, expected) + def test_numeric_conversions(self): self.assertEqual(ct(0), np.timedelta64(0,'ns')) self.assertEqual(ct(10), np.timedelta64(10,'ns')) @@ -99,7 +344,7 @@ def conv(v): self.assertEqual(ct('06:00:01.0'), conv(np.timedelta64(6*3600+1,'s'))) self.assertEqual(ct('06:00:01.01'), conv(np.timedelta64(1000*(6*3600+1)+10,'ms'))) - self.assertEqual(ct('- 1days, 00:00:01'), -conv(d1+np.timedelta64(1,'s'))) + self.assertEqual(ct('- 1days, 00:00:01'), conv(-d1+np.timedelta64(1,'s'))) self.assertEqual(ct('1days, 06:00:01'), conv(d1+np.timedelta64(6*3600+1,'s'))) self.assertEqual(ct('1days, 06:00:01.01'), conv(d1+np.timedelta64(1000*(6*3600+1)+10,'ms'))) @@ -141,9 +386,9 @@ def conv(v): tm.assert_series_equal(result, expected) # with units - result = Series([ np.timedelta64(0,'ns'), np.timedelta64(10,'s').astype('m8[ns]') ],dtype='m8[ns]') + result = TimedeltaIndex([ np.timedelta64(0,'ns'), np.timedelta64(10,'s').astype('m8[ns]') ]) expected = 
to_timedelta([0,10],unit='s') - tm.assert_series_equal(result, expected) + tm.assert_index_equal(result, expected) # single element conversion v = timedelta(seconds=1) @@ -159,40 +404,40 @@ def conv(v): # arrays of various dtypes arr = np.array([1]*5,dtype='int64') result = to_timedelta(arr,unit='s') - expected = Series([ np.timedelta64(1,'s') ]*5) - tm.assert_series_equal(result, expected) + expected = TimedeltaIndex([ np.timedelta64(1,'s') ]*5) + tm.assert_index_equal(result, expected) arr = np.array([1]*5,dtype='int64') result = to_timedelta(arr,unit='m') - expected = Series([ np.timedelta64(1,'m') ]*5) - tm.assert_series_equal(result, expected) + expected = TimedeltaIndex([ np.timedelta64(1,'m') ]*5) + tm.assert_index_equal(result, expected) arr = np.array([1]*5,dtype='int64') result = to_timedelta(arr,unit='h') - expected = Series([ np.timedelta64(1,'h') ]*5) - tm.assert_series_equal(result, expected) + expected = TimedeltaIndex([ np.timedelta64(1,'h') ]*5) + tm.assert_index_equal(result, expected) arr = np.array([1]*5,dtype='timedelta64[s]') result = to_timedelta(arr) - expected = Series([ np.timedelta64(1,'s') ]*5) - tm.assert_series_equal(result, expected) + expected = TimedeltaIndex([ np.timedelta64(1,'s') ]*5) + tm.assert_index_equal(result, expected) arr = np.array([1]*5,dtype='timedelta64[D]') result = to_timedelta(arr) - expected = Series([ np.timedelta64(1,'D') ]*5) - tm.assert_series_equal(result, expected) + expected = TimedeltaIndex([ np.timedelta64(1,'D') ]*5) + tm.assert_index_equal(result, expected) def testit(unit, transform): # array result = to_timedelta(np.arange(5),unit=unit) - expected = Series([ np.timedelta64(i,transform(unit)) for i in np.arange(5).tolist() ]) - tm.assert_series_equal(result, expected) + expected = TimedeltaIndex([ np.timedelta64(i,transform(unit)) for i in np.arange(5).tolist() ]) + tm.assert_index_equal(result, expected) # scalar result = to_timedelta(2,unit=unit) - expected = np.timedelta64(2,transform(unit)).astype('timedelta64[ns]') - self.assert_numpy_array_equal(result,expected) + expected = Timedelta(np.timedelta64(2,transform(unit)).astype('timedelta64[ns]')) + self.assertEqual(result, expected) # validate all units # GH 6855 @@ -212,8 +457,6 @@ def testit(unit, transform): testit('L',lambda x: 'ms') # these will error - self.assertRaises(ValueError, lambda : to_timedelta(['1h'])) - self.assertRaises(ValueError, lambda : to_timedelta(['1m'])) self.assertRaises(ValueError, lambda : to_timedelta([1,2],unit='foo')) self.assertRaises(ValueError, lambda : to_timedelta(1,unit='foo')) @@ -228,30 +471,48 @@ def test_to_timedelta_via_apply(self): def test_timedelta_ops(self): # GH4984 - # make sure ops return timedeltas + # make sure ops return Timedelta s = Series([Timestamp('20130101') + timedelta(seconds=i*i) for i in range(10) ]) td = s.diff() - result = td.mean()[0] - # TODO This should have returned a scalar to begin with. Hack for now. + result = td.mean() expected = to_timedelta(timedelta(seconds=9)) - tm.assert_almost_equal(result, expected) + self.assertEqual(result, expected) + + result = td.to_frame().mean() + self.assertEqual(result[0], expected) result = td.quantile(.1) - # This properly returned a scalar. - expected = np.timedelta64(2599999999,'ns') - tm.assert_almost_equal(result, expected) + expected = Timedelta(np.timedelta64(2600,'ms')) + self.assertEqual(result, expected) - result = td.median()[0] - # TODO This should have returned a scalar to begin with. Hack for now. 
+ result = td.median() expected = to_timedelta('00:00:08') - tm.assert_almost_equal(result, expected) + self.assertEqual(result, expected) + + result = td.to_frame().median() + self.assertEqual(result[0], expected) # GH 6462 # consistency in returned values for sum - result = td.sum()[0] + result = td.sum() expected = to_timedelta('00:01:21') - tm.assert_almost_equal(result, expected) + self.assertEqual(result, expected) + + result = td.to_frame().sum() + self.assertEqual(result[0], expected) + + # std + result = td.std() + expected = to_timedelta(Series(td.dropna().values).std()) + self.assertEqual(result, expected) + + result = td.to_frame().std() + self.assertEqual(result[0], expected) + + # invalid ops + for op in ['skew','kurt','sem','var','prod']: + self.assertRaises(TypeError, lambda : getattr(td,op)()) def test_timedelta_ops_scalar(self): # GH 6808 @@ -297,10 +558,10 @@ def test_to_timedelta_on_missing_values(self): assert_series_equal(actual, expected) actual = pd.to_timedelta(np.nan) - self.assertEqual(actual.astype('int64'), timedelta_NaT.astype('int64')) + self.assertEqual(actual.value, timedelta_NaT.astype('int64')) actual = pd.to_timedelta(pd.NaT) - self.assertEqual(actual.astype('int64'), timedelta_NaT.astype('int64')) + self.assertEqual(actual.value, timedelta_NaT.astype('int64')) def test_timedelta_ops_with_missing_values(self): # setup @@ -394,6 +655,567 @@ def test_apply_to_timedelta(self): # Can't compare until apply on a Series gives the correct dtype # assert_series_equal(a, b) + def test_pickle(self): + + v = Timedelta('1 days 10:11:12.0123456') + v_p = self.round_trip_pickle(v) + self.assertEqual(v,v_p) + +class TestTimedeltaIndex(tm.TestCase): + _multiprocess_can_split_ = True + + def test_pass_TimedeltaIndex_to_index(self): + + rng = timedelta_range('1 days','10 days') + idx = Index(rng, dtype=object) + + expected = Index(rng.to_pytimedelta(), dtype=object) + + self.assert_numpy_array_equal(idx.values, expected.values) + + def test_pickle(self): + + rng = timedelta_range('1 days', periods=10) + rng_p = self.round_trip_pickle(rng) + tm.assert_index_equal(rng,rng_p) + + def test_hash_error(self): + index = timedelta_range('1 days', periods=10) + with tm.assertRaisesRegexp(TypeError, + "unhashable type: %r" % + type(index).__name__): + hash(index) + + def test_append_join_nondatetimeindex(self): + rng = timedelta_range('1 days', periods=10) + idx = Index(['a', 'b', 'c', 'd']) + + result = rng.append(idx) + tm.assert_isinstance(result[0], Timedelta) + + # it works + rng.join(idx, how='outer') + + def test_append_numpy_bug_1681(self): + + td = timedelta_range('1 days','10 days',freq='2D') + a = DataFrame() + c = DataFrame({'A': 'foo', 'B': td}, index=td) + str(c) + + result = a.append(c) + self.assertTrue((result['B'] == td).all()) + + def test_astype(self): + rng = timedelta_range('1 days', periods=10) + + result = rng.astype('i8') + self.assert_numpy_array_equal(result, rng.asi8) + + def test_fields(self): + rng = timedelta_range('1 days, 10:11:12', periods=2, freq='s') + self.assert_numpy_array_equal(rng.days, np.array([1,1],dtype='int64')) + self.assert_numpy_array_equal(rng.hours, np.array([10,10],dtype='int64')) + self.assert_numpy_array_equal(rng.minutes, np.array([11,11],dtype='int64')) + self.assert_numpy_array_equal(rng.seconds, np.array([12,13],dtype='int64')) + self.assert_numpy_array_equal(rng.milliseconds, np.array([0,0],dtype='int64')) + self.assert_numpy_array_equal(rng.microseconds, np.array([0,0],dtype='int64')) + 
self.assert_numpy_array_equal(rng.nanoseconds, np.array([0,0],dtype='int64')) + + # with nat + s = Series(rng) + s[1] = np.nan + + tm.assert_series_equal(s.dt.days,Series([1,np.nan],index=[0,1])) + tm.assert_series_equal(s.dt.hours,Series([10,np.nan],index=[0,1])) + tm.assert_series_equal(s.dt.milliseconds,Series([0,np.nan],index=[0,1])) + + def test_components(self): + rng = timedelta_range('1 days, 10:11:12', periods=2, freq='s') + rng.components + + # with nat + s = Series(rng) + s[1] = np.nan + + result = s.dt.components + self.assertFalse(result.iloc[0].isnull().all()) + self.assertTrue(result.iloc[1].isnull().all()) + + def test_constructor(self): + expected = TimedeltaIndex(['1 days','1 days 00:00:05', + '2 days','2 days 00:00:02','0 days 00:00:03']) + result = TimedeltaIndex(['1 days','1 days, 00:00:05', + np.timedelta64(2,'D'), + timedelta(days=2,seconds=2), + pd.offsets.Second(3)]) + tm.assert_index_equal(result,expected) + + def test_constructor_coverage(self): + rng = timedelta_range('1 days', periods=10.5) + exp = timedelta_range('1 days', periods=10) + self.assertTrue(rng.equals(exp)) + + self.assertRaises(ValueError, TimedeltaIndex, start='1 days', + periods='foo', freq='D') + + self.assertRaises(ValueError, TimedeltaIndex, start='1 days', + end='10 days') + + self.assertRaises(ValueError, TimedeltaIndex, '1 days') + + # generator expression + gen = (timedelta(i) for i in range(10)) + result = TimedeltaIndex(gen) + expected = TimedeltaIndex([timedelta(i) for i in range(10)]) + self.assertTrue(result.equals(expected)) + + # NumPy string array + strings = np.array(['1 days', '2 days', '3 days']) + result = TimedeltaIndex(strings) + expected = to_timedelta([1,2,3],unit='d') + self.assertTrue(result.equals(expected)) + + from_ints = TimedeltaIndex(expected.asi8) + self.assertTrue(from_ints.equals(expected)) + + # non-conforming freq + self.assertRaises(ValueError, TimedeltaIndex, + ['1 days', '2 days', '4 days'], + freq='D') + + self.assertRaises(ValueError, TimedeltaIndex, periods=10, freq='D') + + def test_constructor_name(self): + idx = TimedeltaIndex(start='1 days', periods=1, freq='D', + name='TEST') + self.assertEqual(idx.name, 'TEST') + + def test_freq_conversion(self): + + # doc example + + # series + td = Series(date_range('20130101',periods=4)) - \ + Series(date_range('20121201',periods=4)) + td[2] += timedelta(minutes=5,seconds=3) + td[3] = np.nan + + result = td / np.timedelta64(1,'D') + expected = Series([31,31,(31*86400+5*60+3)/86400.0,np.nan]) + assert_series_equal(result,expected) + + result = td.astype('timedelta64[D]') + expected = Series([31,31,31,np.nan]) + assert_series_equal(result,expected) + + result = td / np.timedelta64(1,'s') + expected = Series([31*86400,31*86400,31*86400+5*60+3,np.nan]) + assert_series_equal(result,expected) + + result = td.astype('timedelta64[s]') + assert_series_equal(result,expected) + + # tdi + td = TimedeltaIndex(td) + + result = td / np.timedelta64(1,'D') + expected = Index([31,31,(31*86400+5*60+3)/86400.0,np.nan]) + assert_index_equal(result,expected) + + result = td.astype('timedelta64[D]') + expected = Index([31,31,31,np.nan]) + assert_index_equal(result,expected) + + result = td / np.timedelta64(1,'s') + expected = Index([31*86400,31*86400,31*86400+5*60+3,np.nan]) + assert_index_equal(result,expected) + + result = td.astype('timedelta64[s]') + assert_index_equal(result,expected) + + def test_comparisons_coverage(self): + rng = timedelta_range('1 days', periods=10) + + result = rng < rng[3] + exp = np.array([True, True, 
True]+[False]*7) + self.assert_numpy_array_equal(result, exp) + + # raise TypeError for now + self.assertRaises(TypeError, rng.__lt__, rng[3].value) + + result = rng == list(rng) + exp = rng == rng + self.assert_numpy_array_equal(result, exp) + + def test_comparisons_nat(self): + if PY3_2: + raise nose.SkipTest('nat comparisons on 3.2 broken') + + tdidx1 = pd.TimedeltaIndex(['1 day', pd.NaT, '1 day 00:00:01', pd.NaT, + '1 day 00:00:01', '5 day 00:00:03']) + tdidx2 = pd.TimedeltaIndex(['2 day', '2 day', pd.NaT, pd.NaT, + '1 day 00:00:02', '5 days 00:00:03']) + tdarr = np.array([np.timedelta64(2,'D'), + np.timedelta64(2,'D'), + np.timedelta64('nat'), np.timedelta64('nat'), + np.timedelta64(1,'D') + np.timedelta64(2,'s'), + np.timedelta64(5,'D') + np.timedelta64(3,'s')]) + + if _np_version_under1p8: + # cannot test array because np.datetime('nat') returns today's date + cases = [(tdidx1, tdidx2)] + else: + cases = [(tdidx1, tdidx2), (tdidx1, tdarr)] + + # Check pd.NaT is handles as the same as np.nan + for idx1, idx2 in cases: + + result = idx1 < idx2 + expected = np.array([True, False, False, False, True, False]) + self.assert_numpy_array_equal(result, expected) + + result = idx2 > idx1 + expected = np.array([True, False, False, False, True, False]) + self.assert_numpy_array_equal(result, expected) + + result = idx1 <= idx2 + expected = np.array([True, False, False, False, True, True]) + self.assert_numpy_array_equal(result, expected) + + result = idx2 >= idx1 + expected = np.array([True, False, False, False, True, True]) + self.assert_numpy_array_equal(result, expected) + + result = idx1 == idx2 + expected = np.array([False, False, False, False, False, True]) + self.assert_numpy_array_equal(result, expected) + + result = idx1 != idx2 + expected = np.array([True, True, True, True, True, False]) + self.assert_numpy_array_equal(result, expected) + + def test_map(self): + + rng = timedelta_range('1 day', periods=10) + + f = lambda x: x.days + result = rng.map(f) + exp = [f(x) for x in rng] + self.assert_numpy_array_equal(result, exp) + + def test_misc_coverage(self): + + rng = timedelta_range('1 day', periods=5) + result = rng.groupby(rng.days) + tm.assert_isinstance(list(result.values())[0][0], Timedelta) + + idx = TimedeltaIndex(['3d','1d','2d']) + self.assertTrue(idx.equals(list(idx))) + + non_td = Index(list('abc')) + self.assertFalse(idx.equals(list(non_td))) + + def test_union(self): + + i1 = timedelta_range('1day',periods=5) + i2 = timedelta_range('3day',periods=5) + result = i1.union(i2) + expected = timedelta_range('1day',periods=7) + self.assert_numpy_array_equal(result, expected) + + i1 = Int64Index(np.arange(0, 20, 2)) + i2 = TimedeltaIndex(start='1 day', periods=10, freq='D') + i1.union(i2) # Works + i2.union(i1) # Fails with "AttributeError: can't set attribute" + + def test_union_coverage(self): + + idx = TimedeltaIndex(['3d','1d','2d']) + ordered = TimedeltaIndex(idx.order(), freq='infer') + result = ordered.union(idx) + self.assertTrue(result.equals(ordered)) + + result = ordered[:0].union(ordered) + self.assertTrue(result.equals(ordered)) + self.assertEqual(result.freq, ordered.freq) + + def test_union_bug_1730(self): + + rng_a = timedelta_range('1 day', periods=4, freq='3H') + rng_b = timedelta_range('1 day', periods=4, freq='4H') + + result = rng_a.union(rng_b) + exp = TimedeltaIndex(sorted(set(list(rng_a)) | set(list(rng_b)))) + self.assertTrue(result.equals(exp)) + + def test_union_bug_1745(self): + + left = TimedeltaIndex(['1 day 15:19:49.695000']) + right = 
TimedeltaIndex(['2 day 13:04:21.322000', + '1 day 15:27:24.873000', + '1 day 15:31:05.350000']) + + result = left.union(right) + exp = TimedeltaIndex(sorted(set(list(left)) | set(list(right)))) + self.assertTrue(result.equals(exp)) + + def test_union_bug_4564(self): + + left = timedelta_range("1 day","30d") + right = left + pd.offsets.Minute(15) + + result = left.union(right) + exp = TimedeltaIndex(sorted(set(list(left)) | set(list(right)))) + self.assertTrue(result.equals(exp)) + + def test_intersection_bug_1708(self): + index_1 = timedelta_range('1 day', periods=4, freq='h') + index_2 = index_1 + pd.offsets.Hour(5) + + result = index_1 & index_2 + self.assertEqual(len(result), 0) + + index_1 = timedelta_range('1 day', periods=4, freq='h') + index_2 = index_1 + pd.offsets.Hour(1) + + result = index_1 & index_2 + expected = timedelta_range('1 day 01:00:00',periods=3,freq='h') + tm.assert_index_equal(result,expected) + + def test_get_duplicates(self): + idx = TimedeltaIndex(['1 day','2 day','2 day','3 day','3day', '4day']) + + result = idx.get_duplicates() + ex = TimedeltaIndex(['2 day','3day']) + self.assertTrue(result.equals(ex)) + + def test_argmin_argmax(self): + idx = TimedeltaIndex(['1 day 00:00:05','1 day 00:00:01','1 day 00:00:02']) + self.assertEqual(idx.argmin(), 1) + self.assertEqual(idx.argmax(), 0) + + def test_order(self): + + idx = TimedeltaIndex(['4d','1d','2d']) + + ordered = idx.order() + self.assertTrue(ordered.is_monotonic) + + ordered = idx.order(ascending=False) + self.assertTrue(ordered[::-1].is_monotonic) + + ordered, dexer = idx.order(return_indexer=True) + self.assertTrue(ordered.is_monotonic) + self.assert_numpy_array_equal(dexer, [1, 2, 0]) + + ordered, dexer = idx.order(return_indexer=True, ascending=False) + self.assertTrue(ordered[::-1].is_monotonic) + self.assert_numpy_array_equal(dexer, [0, 2, 1]) + + def test_insert(self): + + idx = TimedeltaIndex(['4day','1day','2day'], name='idx') + + result = idx.insert(2, timedelta(days=5)) + exp = TimedeltaIndex(['4day','1day','5day','2day'],name='idx') + self.assertTrue(result.equals(exp)) + + # insertion of non-datetime should coerce to object index + result = idx.insert(1, 'inserted') + expected = Index([Timedelta('4day'), 'inserted', Timedelta('1day'), + Timedelta('2day')], name='idx') + self.assertNotIsInstance(result, TimedeltaIndex) + tm.assert_index_equal(result, expected) + self.assertEqual(result.name, expected.name) + + idx = timedelta_range('1day 00:00:01', periods=3, freq='s', name='idx') + + # preserve freq + expected_0 = TimedeltaIndex(['1day','1day 00:00:01','1day 00:00:02','1day 00:00:03'], + name='idx', freq='s') + expected_3 = TimedeltaIndex(['1day 00:00:01','1day 00:00:02','1day 00:00:03','1day 00:00:04'], + name='idx', freq='s') + + # reset freq to None + expected_1_nofreq = TimedeltaIndex(['1day 00:00:01','1day 00:00:01','1day 00:00:02','1day 00:00:03'], + name='idx', freq=None) + expected_3_nofreq = TimedeltaIndex(['1day 00:00:01','1day 00:00:02','1day 00:00:03','1day 00:00:05'], + name='idx', freq=None) + + cases = [(0, Timedelta('1day'), expected_0), + (-3, Timedelta('1day'), expected_0), + (3, Timedelta('1day 00:00:04'), expected_3), + (1, Timedelta('1day 00:00:01'), expected_1_nofreq), + (3, Timedelta('1day 00:00:05'), expected_3_nofreq)] + + for n, d, expected in cases: + result = idx.insert(n, d) + self.assertTrue(result.equals(expected)) + self.assertEqual(result.name, expected.name) + self.assertEqual(result.freq, expected.freq) + + def test_delete(self): + idx = timedelta_range(start='1 
Days', periods=5, freq='D', name='idx') + + # prserve freq + expected_0 = timedelta_range(start='2 Days', periods=4, freq='D', name='idx') + expected_4 = timedelta_range(start='1 Days', periods=4, freq='D', name='idx') + + # reset freq to None + expected_1 = TimedeltaIndex(['1 day','3 day','4 day', '5 day'],freq=None,name='idx') + + cases ={0: expected_0, -5: expected_0, + -1: expected_4, 4: expected_4, + 1: expected_1} + for n, expected in compat.iteritems(cases): + result = idx.delete(n) + self.assertTrue(result.equals(expected)) + self.assertEqual(result.name, expected.name) + self.assertEqual(result.freq, expected.freq) + + with tm.assertRaises((IndexError, ValueError)): + # either depeidnig on numpy version + result = idx.delete(5) + + def test_delete_slice(self): + idx = timedelta_range(start='1 days', periods=10, freq='D', name='idx') + + # prserve freq + expected_0_2 = timedelta_range(start='4 days', periods=7, freq='D', name='idx') + expected_7_9 = timedelta_range(start='1 days', periods=7, freq='D', name='idx') + + # reset freq to None + expected_3_5 = TimedeltaIndex(['1 d','2 d','3 d', + '7 d','8 d','9 d','10d'], freq=None, name='idx') + + cases ={(0, 1, 2): expected_0_2, + (7, 8, 9): expected_7_9, + (3, 4, 5): expected_3_5} + for n, expected in compat.iteritems(cases): + result = idx.delete(n) + self.assertTrue(result.equals(expected)) + self.assertEqual(result.name, expected.name) + self.assertEqual(result.freq, expected.freq) + + result = idx.delete(slice(n[0], n[-1] + 1)) + self.assertTrue(result.equals(expected)) + self.assertEqual(result.name, expected.name) + self.assertEqual(result.freq, expected.freq) + + def test_take(self): + + tds = ['1day 02:00:00','1 day 04:00:00','1 day 10:00:00'] + idx = TimedeltaIndex(start='1d',end='2d',freq='H',name='idx') + expected = TimedeltaIndex(tds, freq=None, name='idx') + + taken1 = idx.take([2, 4, 10]) + taken2 = idx[[2,4,10]] + + for taken in [taken1, taken2]: + self.assertTrue(taken.equals(expected)) + tm.assert_isinstance(taken, TimedeltaIndex) + self.assertIsNone(taken.freq) + self.assertEqual(taken.name, expected.name) + + def test_isin(self): + + index = tm.makeTimedeltaIndex(4) + result = index.isin(index) + self.assertTrue(result.all()) + + result = index.isin(list(index)) + self.assertTrue(result.all()) + + assert_almost_equal(index.isin([index[2], 5]), + [False, False, True, False]) + + def test_does_not_convert_mixed_integer(self): + df = tm.makeCustomDataframe(10, 10, data_gen_f=lambda *args, **kwargs: + randn(), r_idx_type='i', c_idx_type='td') + str(df) + + cols = df.columns.join(df.index, how='outer') + joined = cols.join(df.columns) + self.assertEqual(cols.dtype, np.dtype('O')) + self.assertEqual(cols.dtype, joined.dtype) + tm.assert_index_equal(cols, joined) + + def test_slice_keeps_name(self): + + # GH4226 + dr = pd.timedelta_range('1d','5d', freq='H', name='timebucket') + self.assertEqual(dr[1:].name, dr.name) + + def test_join_self(self): + + index = timedelta_range('1 day', periods=10) + kinds = 'outer', 'inner', 'left', 'right' + for kind in kinds: + joined = index.join(index, how=kind) + self.assertIs(index, joined) + + def test_factorize(self): + idx1 = TimedeltaIndex(['1 day','1 day','2 day', + '2 day','3 day','3 day']) + + exp_arr = np.array([0, 0, 1, 1, 2, 2]) + exp_idx = TimedeltaIndex(['1 day','2 day','3 day']) + + arr, idx = idx1.factorize() + self.assert_numpy_array_equal(arr, exp_arr) + self.assertTrue(idx.equals(exp_idx)) + + arr, idx = idx1.factorize(sort=True) + self.assert_numpy_array_equal(arr, 
exp_arr) + self.assertTrue(idx.equals(exp_idx)) + + # freq must be preserved + idx3 = timedelta_range('1 day', periods=4, freq='s') + exp_arr = np.array([0, 1, 2, 3]) + arr, idx = idx3.factorize() + self.assert_numpy_array_equal(arr, exp_arr) + self.assertTrue(idx.equals(idx3)) + +class TestSlicing(tm.TestCase): + + def test_partial_slice(self): + rng = timedelta_range('1 day 10:11:12', freq='h',periods=500) + s = Series(np.arange(len(rng)), index=rng) + + result = s['5 day':'6 day'] + expected = s.iloc[86:134] + assert_series_equal(result, expected) + + result = s['5 day':] + expected = s.iloc[86:] + assert_series_equal(result, expected) + + result = s[:'6 day'] + expected = s.iloc[:134] + assert_series_equal(result, expected) + + result = s['6 days, 23:11:12'] + self.assertEqual(result, s.irow(133)) + + self.assertRaises(KeyError, s.__getitem__, '50 days') + + def test_partial_slice_high_reso(self): + + # higher reso + rng = timedelta_range('1 day 10:11:12', freq='us',periods=2000) + s = Series(np.arange(len(rng)), index=rng) + + result = s['1 day 10:11:12':] + expected = s.iloc[0:] + assert_series_equal(result, expected) + + result = s['1 day 10:11:12.001':] + expected = s.iloc[1000:] + assert_series_equal(result, expected) + + result = s['1 days, 10:11:12.001001'] + self.assertEqual(result, s.irow(1001)) if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 3da97074a93fd..1980924483bfb 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -332,7 +332,6 @@ def test_dti_slicing(self): def test_pass_datetimeindex_to_index(self): # Bugs in #1396 - rng = date_range('1/1/2000', '3/1/2000') idx = Index(rng, dtype=object) @@ -2898,7 +2897,7 @@ def test_datetimeindex_diff(self): periods=100) dti2 = DatetimeIndex(freq='Q-JAN', start=datetime(1997, 12, 31), periods=98) - self.assertEqual(len(dti1.diff(dti2)), 2) + self.assertEqual(len(dti1.difference(dti2)), 2) def test_fancy_getitem(self): dti = DatetimeIndex(freq='WOM-1FRI', start=datetime(2005, 1, 1), @@ -3407,6 +3406,35 @@ def test_comparison(self): self.assertTrue(other > val) self.assertTrue(other >= val) + def test_compare_invalid(self): + + # GH 8058 + val = Timestamp('20130101 12:01:02') + self.assertFalse(val == 'foo') + self.assertFalse(val == 10.0) + self.assertFalse(val == 1) + self.assertFalse(val == long(1)) + self.assertFalse(val == []) + self.assertFalse(val == {'foo' : 1}) + self.assertFalse(val == np.float64(1)) + self.assertFalse(val == np.int64(1)) + + self.assertTrue(val != 'foo') + self.assertTrue(val != 10.0) + self.assertTrue(val != 1) + self.assertTrue(val != long(1)) + self.assertTrue(val != []) + self.assertTrue(val != {'foo' : 1}) + self.assertTrue(val != np.float64(1)) + self.assertTrue(val != np.int64(1)) + + # ops testing + df = DataFrame(randn(5,2)) + a = df[0] + b = Series(randn(5)) + b.name = Timestamp('2000-01-01') + tm.assert_series_equal(a / b, 1 / (b / a)) + def test_cant_compare_tz_naive_w_aware(self): tm._skip_if_no_pytz() # #1404 diff --git a/pandas/tseries/tests/test_timezones.py b/pandas/tseries/tests/test_timezones.py index 5635bb75dd9ce..9fbdb714d8cfa 100644 --- a/pandas/tseries/tests/test_timezones.py +++ b/pandas/tseries/tests/test_timezones.py @@ -423,31 +423,98 @@ def test_with_tz_ambiguous_times(self): dr = date_range(datetime(2011, 3, 13), periods=48, freq=datetools.Minute(30), tz=pytz.utc) - def 
diff --git a/pandas/tseries/tests/test_timezones.py b/pandas/tseries/tests/test_timezones.py
index 5635bb75dd9ce..9fbdb714d8cfa 100644
--- a/pandas/tseries/tests/test_timezones.py
+++ b/pandas/tseries/tests/test_timezones.py
@@ -423,31 +423,98 @@ def test_with_tz_ambiguous_times(self):
         dr = date_range(datetime(2011, 3, 13), periods=48,
                         freq=datetools.Minute(30), tz=pytz.utc)
 
-    def test_infer_dst(self):
+    def test_ambiguous_infer(self):
         # November 6, 2011, fall back, repeat 2 AM hour
         # With no repeated hours, we cannot infer the transition
         tz = self.tz('US/Eastern')
         dr = date_range(datetime(2011, 11, 6, 0), periods=5,
                         freq=datetools.Hour())
-        self.assertRaises(pytz.AmbiguousTimeError, dr.tz_localize,
-                          tz, infer_dst=True)
+        self.assertRaises(pytz.AmbiguousTimeError, dr.tz_localize, tz)
 
         # With repeated hours, we can infer the transition
         dr = date_range(datetime(2011, 11, 6, 0), periods=5,
                        freq=datetools.Hour(), tz=tz)
-        di = DatetimeIndex(['11/06/2011 00:00', '11/06/2011 01:00',
-                            '11/06/2011 01:00', '11/06/2011 02:00',
-                            '11/06/2011 03:00'])
-        localized = di.tz_localize(tz, infer_dst=True)
+        times = ['11/06/2011 00:00', '11/06/2011 01:00',
+                 '11/06/2011 01:00', '11/06/2011 02:00',
+                 '11/06/2011 03:00']
+        di = DatetimeIndex(times)
+        localized = di.tz_localize(tz, ambiguous='infer')
         self.assert_numpy_array_equal(dr, localized)
-
+        localized_old = di.tz_localize(tz, infer_dst=True)
+        self.assert_numpy_array_equal(dr, localized_old)
+        self.assert_numpy_array_equal(dr, DatetimeIndex(times, tz=tz, ambiguous='infer'))
+
         # When there is no dst transition, nothing special happens
         dr = date_range(datetime(2011, 6, 1, 0), periods=10,
                         freq=datetools.Hour())
         localized = dr.tz_localize(tz)
-        localized_infer = dr.tz_localize(tz, infer_dst=True)
+        localized_infer = dr.tz_localize(tz, ambiguous='infer')
         self.assert_numpy_array_equal(localized, localized_infer)
+        localized_infer_old = dr.tz_localize(tz, infer_dst=True)
+        self.assert_numpy_array_equal(localized, localized_infer_old)
+
+    def test_ambiguous_flags(self):
+        # November 6, 2011, fall back, repeat 2 AM hour
+        tz = self.tz('US/Eastern')
+
+        # Pass in flags to determine right dst transition
+        dr = date_range(datetime(2011, 11, 6, 0), periods=5,
+                        freq=datetools.Hour(), tz=tz)
+        times = ['11/06/2011 00:00', '11/06/2011 01:00',
+                 '11/06/2011 01:00', '11/06/2011 02:00',
+                 '11/06/2011 03:00']
+
+        # Test tz_localize
+        di = DatetimeIndex(times)
+        is_dst = [1, 1, 0, 0, 0]
+        localized = di.tz_localize(tz, ambiguous=is_dst)
+        self.assert_numpy_array_equal(dr, localized)
+        self.assert_numpy_array_equal(dr, DatetimeIndex(times, tz=tz, ambiguous=is_dst))
+
+        localized = di.tz_localize(tz, ambiguous=np.array(is_dst))
+        self.assert_numpy_array_equal(dr, localized)
+
+        localized = di.tz_localize(tz, ambiguous=np.array(is_dst).astype('bool'))
+        self.assert_numpy_array_equal(dr, localized)
+
+        # Test constructor
+        localized = DatetimeIndex(times, tz=tz, ambiguous=is_dst)
+        self.assert_numpy_array_equal(dr, localized)
+
+        # Test duplicate times where infer_dst fails
+        times += times
+        di = DatetimeIndex(times)
+
+        # When the sizes are incompatible, make sure error is raised
+        self.assertRaises(Exception, di.tz_localize, tz, ambiguous=is_dst)
+
+        # When sizes are compatible and there are repeats ('infer' won't work)
+        is_dst = np.hstack((is_dst, is_dst))
+        localized = di.tz_localize(tz, ambiguous=is_dst)
+        dr = dr.append(dr)
+        self.assert_numpy_array_equal(dr, localized)
+        # When there is no dst transition, nothing special happens
+        dr = date_range(datetime(2011, 6, 1, 0), periods=10,
+                        freq=datetools.Hour())
+        is_dst = np.array([1] * 10)
+        localized = dr.tz_localize(tz)
+        localized_is_dst = dr.tz_localize(tz, ambiguous=is_dst)
+        self.assert_numpy_array_equal(localized, localized_is_dst)
+
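To summarise the API change the two tests above exercise: the old infer_dst=True flag is superseded by a single ambiguous keyword that accepts 'infer' or an explicit per-element boolean array. A minimal sketch (assuming pytz is installed and the index values match the test data):

    import numpy as np
    import pandas as pd

    times = ['11/06/2011 00:00', '11/06/2011 01:00',
             '11/06/2011 01:00', '11/06/2011 02:00',
             '11/06/2011 03:00']
    di = pd.DatetimeIndex(times)
    # Let pandas infer which of the repeated 01:00 wall times falls in DST...
    localized = di.tz_localize('US/Eastern', ambiguous='infer')
    # ...or state it explicitly, one flag per element (True = DST).
    flags = np.array([True, True, False, False, False])
    localized_explicit = di.tz_localize('US/Eastern', ambiguous=flags)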
+    def test_ambiguous_nat(self):
+        tz = self.tz('US/Eastern')
+        times = ['11/06/2011 00:00', '11/06/2011 01:00',
+                 '11/06/2011 01:00', '11/06/2011 02:00',
+                 '11/06/2011 03:00']
+        di = DatetimeIndex(times)
+        localized = di.tz_localize(tz, ambiguous='NaT')
+
+        times = ['11/06/2011 00:00', np.NaN,
+                 np.NaN, '11/06/2011 02:00',
+                 '11/06/2011 03:00']
+        di_test = DatetimeIndex(times, tz='US/Eastern')
+        self.assert_numpy_array_equal(di_test, localized)
 
     # test utility methods
     def test_infer_tz(self):
diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py
index 57dc5f4404621..9adcbb4ea4a41 100644
--- a/pandas/tseries/tests/test_tslib.py
+++ b/pandas/tseries/tests/test_tslib.py
@@ -138,7 +138,7 @@ def test_constructor_with_stringoffset(self):
 
     def test_repr(self):
         dates = ['2014-03-07', '2014-01-01 09:00', '2014-01-01 00:00:00.000000001']
-        timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/US/Pacific']
+        timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/America/Los_Angeles']
         freqs = ['D', 'M', 'S', 'N']
 
         for date in dates:
@@ -232,6 +232,26 @@ def test_tz(self):
         conv = local.tz_convert('US/Eastern')
         self.assertEqual(conv.nanosecond, 5)
         self.assertEqual(conv.hour, 19)
+
+    def test_tz_localize_ambiguous(self):
+
+        ts = Timestamp('2014-11-02 01:00')
+        ts_dst = ts.tz_localize('US/Eastern', ambiguous=True)
+        ts_no_dst = ts.tz_localize('US/Eastern', ambiguous=False)
+
+        rng = date_range('2014-11-02', periods=3, freq='H', tz='US/Eastern')
+        self.assertEqual(rng[1], ts_dst)
+        self.assertEqual(rng[2], ts_no_dst)
+        self.assertRaises(ValueError, ts.tz_localize, 'US/Eastern', ambiguous='infer')
+
+        # GH 8025
+        with tm.assertRaisesRegexp(TypeError, 'Cannot localize tz-aware Timestamp, use '
+                                              'tz_convert for conversions'):
+            Timestamp('2011-01-01' ,tz='US/Eastern').tz_localize('Asia/Tokyo')
+
+        with tm.assertRaisesRegexp(TypeError, 'Cannot convert tz-naive Timestamp, use '
+                                              'tz_localize to localize'):
+            Timestamp('2011-01-01').tz_convert('Asia/Tokyo')
 
     def test_tz_localize_roundtrip(self):
         for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/US/Pacific']:
@@ -241,7 +261,7 @@ def test_tz_localize_roundtrip(self):
             localized = ts.tz_localize(tz)
             self.assertEqual(localized, Timestamp(t, tz=tz))
 
-            with tm.assertRaises(Exception):
+            with tm.assertRaises(TypeError):
                 localized.tz_localize(tz)
 
             reset = localized.tz_localize(None)
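The same ambiguous keyword works on a scalar Timestamp, and localizing or converting in the wrong direction now raises TypeError instead of a generic Exception. A minimal sketch of the behaviour the test_tslib.py hunks above assert:

    import pandas as pd

    ts = pd.Timestamp('2014-11-02 01:00')
    ts_dst = ts.tz_localize('US/Eastern', ambiguous=True)    # pick the DST (earlier) offset
    ts_std = ts.tz_localize('US/Eastern', ambiguous=False)   # pick the standard (later) offset

    try:
        pd.Timestamp('2011-01-01', tz='US/Eastern').tz_localize('Asia/Tokyo')
    except TypeError:
        pass  # already tz-aware: use tz_convert instead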
diff --git a/pandas/tseries/timedeltas.py b/pandas/tseries/timedeltas.py
index e762ebe9d85cf..dc60f5024c9ed 100644
--- a/pandas/tseries/timedeltas.py
+++ b/pandas/tseries/timedeltas.py
@@ -12,19 +12,18 @@
     is_timedelta64_dtype, _values_from_object, is_list_like, isnull,
     _ensure_object)
 
-repr_timedelta = tslib.repr_timedelta64
-repr_timedelta64 = tslib.repr_timedelta64
-
-def to_timedelta(arg, box=True, unit='ns'):
+def to_timedelta(arg, unit='ns', box=True, coerce=False):
     """
     Convert argument to timedelta
 
     Parameters
     ----------
     arg : string, timedelta, array of strings (with possible NAs)
-    box : boolean, default True
-        If True returns a Series of the results, if False returns ndarray of values
     unit : unit of the arg (D,h,m,s,ms,us,ns) denote the unit, which is an integer/float number
+    box : boolean, default True
+        If True returns a Timedelta/TimedeltaIndex of the results
+        if False returns a np.timedelta64 or ndarray of values of dtype timedelta64[ns]
+    coerce : force errors to NaT (False by default)
 
     Returns
     -------
@@ -34,8 +33,8 @@ def _convert_listlike(arg, box, unit):
 
-        if isinstance(arg, (list,tuple)):
-            arg = np.array(arg, dtype='O')
+        if isinstance(arg, (list,tuple)) or ((hasattr(arg,'__iter__') and not hasattr(arg,'dtype'))):
+            arg = np.array(list(arg), dtype='O')
+
         if is_timedelta64_dtype(arg):
             value = arg.astype('timedelta64[ns]')
@@ -45,13 +44,18 @@ def _convert_listlike(arg, box, unit):
             value = arg.astype('timedelta64[{0}]'.format(unit)).astype('timedelta64[ns]')
         else:
             try:
-                value = tslib.array_to_timedelta64(_ensure_object(arg), unit=unit)
+                value = tslib.array_to_timedelta64(_ensure_object(arg), unit=unit, coerce=coerce)
             except:
-                value = np.array([ _coerce_scalar_to_timedelta_type(r, unit=unit) for r in arg ])
+
+                # try to process strings fast; may need to fallback
+                try:
+                    value = np.array([ _get_string_converter(r, unit=unit)() for r in arg ],dtype='m8[ns]')
+                except:
+                    value = np.array([ _coerce_scalar_to_timedelta_type(r, unit=unit, coerce=coerce) for r in arg ])
 
         if box:
-            from pandas import Series
-            value = Series(value,dtype='m8[ns]')
+            from pandas import TimedeltaIndex
+            value = TimedeltaIndex(value,unit='ns')
         return value
 
     if arg is None:
@@ -64,7 +68,7 @@ def _convert_listlike(arg, box, unit):
         return _convert_listlike(arg, box=box, unit=unit)
 
     # ...so it must be a scalar value. Return scalar.
-    return _coerce_scalar_to_timedelta_type(arg, unit=unit)
+    return _coerce_scalar_to_timedelta_type(arg, unit=unit, box=box, coerce=coerce)
 
 _unit_map = {
     'Y' : 'Y',
@@ -92,24 +96,48 @@ def _convert_listlike(arg, box, unit):
     'NS' : 'ns',
     'ns' : 'ns',
     }
+_unit_scale = {
+    'd' : 86400*1e9,
+    'h' : 3600*1e9,
+    'm' : 60*1e9,
+    's' : 1e9,
+    'ms' : 1e6,
+    'us' : 1e3,
+    'ns' : 1,
+    }
 
 def _validate_timedelta_unit(arg):
     """ provide validation / translation for timedelta short units """
     try:
         return _unit_map[arg]
     except:
+        if arg is None:
+            return 'ns'
         raise ValueError("invalid timedelta unit {0} provided".format(arg))
 
 _short_search = re.compile(
     "^\s*(?P<neg>-?)\s*(?P<value>\d*\.?\d*)\s*(?P<unit>d|s|ms|us|ns)?\s*$",re.IGNORECASE)
 _full_search = re.compile(
-    "^\s*(?P<neg>-?)\s*(?P<days>\d+)?\s*(days|d|day)?,?\s*(?P