Merge pull request #4 from pandas-dev/master

merges upstream
phofl · Apr 10, 2020 · 43fca7c · 43fca7c
2 parents a1a1cb2 + b7e786e
commit 43fca7c
Show file tree

Hide file tree

Showing 596 changed files with 21,699 additions and 16,127 deletions.
diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
@@ -2,7 +2,7 @@
 
 Whether you are a novice or experienced software developer, all contributions and suggestions are welcome!
 
-Our main contributing guide can be found [in this repo](https://github.com/pandas-dev/pandas/blob/master/doc/source/development/contributing.rst) or [on the website](https://pandas-docs.github.io/pandas-docs-travis/development/contributing.html). If you do not want to read it in its entirety, we will summarize the main ways in which you can contribute and point to relevant sections of that document for further information.
+Our main contributing guide can be found [in this repo](https://github.com/pandas-dev/pandas/blob/master/doc/source/development/contributing.rst) or [on the website](https://pandas.pydata.org/docs/dev/development/contributing.html). If you do not want to read it in its entirety, we will summarize the main ways in which you can contribute and point to relevant sections of that document for further information.
 
 ## Getting Started
 

diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,39 @@
+---
+
+name: Bug Report
+about: Create a bug report to help us improve pandas
+title: "BUG:"
+labels: "Bug, Needs Triage"
+
+---
+
+- [ ] I have checked that this issue has not already been reported.
+
+- [ ] I have confirmed this bug exists on the latest version of pandas.
+
+- [ ] (optional) I have confirmed this bug exists on the master branch of pandas.
+
+---
+
+**Note**: Please read [this guide](https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) detailing how to provide the necessary information for us to reproduce your bug.
+
+#### Code Sample, a copy-pastable example
+
+```python
+# Your code here
+
+```
+
+#### Problem description
+
+[this should explain **why** the current behaviour is a problem and why the expected output is a better solution]
+
+#### Expected Output
+
+#### Output of ``pd.show_versions()``
+
+<details>
+
+[paste the output of ``pd.show_versions()`` here leaving a blank line after the details tag]
+
+</details>
diff --git a/.github/ISSUE_TEMPLATE/documentation_improvement.md b/.github/ISSUE_TEMPLATE/documentation_improvement.md
@@ -0,0 +1,22 @@
+---
+
+name: Documentation Improvement
+about: Report wrong or missing documentation
+title: "DOC:"
+labels: "Docs, Needs Triage"
+
+---
+
+#### Location of the documentation
+
+[this should provide the location of the documentation, e.g. "pandas.read_csv" or the URL of the documentation, e.g. "https://pandas.pydata.org/docs/dev/reference/api/pandas.read_csv.html"]
+
+**Note**: You can check the latest versions of the docs on `master` [here](https://pandas.pydata.org/docs/dev/).
+
+#### Documentation problem
+
+[this should provide a description of what documentation you believe needs to be fixed/improved]
+
+#### Suggested fix for documentation
+
+[this should explain the suggested fix and **why** it's better than the existing documentation]
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,33 @@
+---
+
+name: Feature Request
+about: Suggest an idea for pandas
+title: "ENH:"
+labels: "Enhancement, Needs Triage"
+
+---
+
+#### Is your feature request related to a problem?
+
+[this should provide a description of what the problem is, e.g. "I wish I could use pandas to do [...]"]
+
+#### Describe the solution you'd like
+
+[this should provide a description of the feature request, e.g. "`DataFrame.foo` should get a new parameter `bar` that [...]", try to write a docstring for the desired feature]
+
+#### API breaking implications
+
+[this should provide a description of how this feature will affect the API]
+
+#### Describe alternatives you've considered
+
+[this should provide a description of any alternative solutions or features you've considered]
+
+#### Additional context
+
+[add any other context, code examples, or references to existing implementations about the feature request here]
+
+```python
+# Your code here, if applicable
+
+```
diff --git a/.github/ISSUE_TEMPLATE/submit_question.md b/.github/ISSUE_TEMPLATE/submit_question.md
@@ -0,0 +1,24 @@
+---
+
+name: Submit Question
+about: Ask a general question about pandas
+title: "QST:"
+labels: "Usage Question, Needs Triage"
+
+---
+
+- [ ] I have searched the [[pandas] tag](https://stackoverflow.com/questions/tagged/pandas) on StackOverflow for similar questions.
+
+- [ ] I have asked my usage related question on [StackOverflow](https://stackoverflow.com).
+
+---
+
+#### Question about pandas
+
+**Note**: If you'd still like to submit a question, please read [this guide](
+https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) detailing how to provide the necessary information for us to reproduce your question.
+
+```python
+# Your code here, if applicable
+
+```
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -125,32 +125,18 @@ jobs:
     - name: Check ipython directive errors
       run: "! grep -B1 \"^<<<-------------------------------------------------------------------------$\" sphinx.log"
 
-    - name: Install Rclone
-      run: sudo apt install rclone -y
-      if: github.event_name == 'push'
-
-    - name: Set up Rclone
+    - name: Install ssh key
       run: |
-        CONF=$HOME/.config/rclone/rclone.conf
-        mkdir -p `dirname $CONF`
-        echo "[ovh_host]" > $CONF
-        echo "type = swift" >> $CONF
-        echo "env_auth = false" >> $CONF
-        echo "auth_version = 3" >> $CONF
-        echo "auth = https://auth.cloud.ovh.net/v3/" >> $CONF
-        echo "endpoint_type = public" >> $CONF
-        echo "tenant_domain = default" >> $CONF
-        echo "tenant = 2977553886518025" >> $CONF
-        echo "domain = default" >> $CONF
-        echo "user = w4KGs3pmDxpd" >> $CONF
-        echo "key = ${{ secrets.ovh_object_store_key }}" >> $CONF
-        echo "region = BHS" >> $CONF
+        mkdir -m 700 -p ~/.ssh
+        echo "${{ secrets.server_ssh_key }}" > ~/.ssh/id_rsa
+        chmod 600 ~/.ssh/id_rsa
+        echo "${{ secrets.server_ip }} ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBE1Kkopomm7FHG5enATf7SgnpICZ4W2bw+Ho+afqin+w7sMcrsa0je7sbztFAV8YchDkiBKnWTG4cRT+KZgZCaY=" > ~/.ssh/known_hosts
       if: github.event_name == 'push'
 
-    - name: Sync web with OVH
-      run: rclone sync --exclude pandas-docs/** web/build ovh_host:prod
+    - name: Upload web
+      run: rsync -az --delete --exclude='pandas-docs' --exclude='docs' --exclude='Pandas_Cheat_Sheet*' web/build/ docs@${{ secrets.server_ip }}:/usr/share/nginx/pandas
       if: github.event_name == 'push'
 
-    - name: Sync dev docs with OVH
-      run: rclone sync doc/build/html ovh_host:prod/pandas-docs/dev
+    - name: Upload dev docs
+      run: rsync -az --delete doc/build/html/ docs@${{ secrets.server_ip }}:/usr/share/nginx/pandas/pandas-docs/dev
       if: github.event_name == 'push'
diff --git a/LICENSES/HAVEN_LICENSE b/LICENSES/HAVEN_LICENSE
@@ -1,2 +1,21 @@
-YEAR: 2013-2016
-COPYRIGHT HOLDER: Hadley Wickham; RStudio; and Evan Miller
+# MIT License
+
+Copyright (c) 2019 Hadley Wickham; RStudio; and Evan Miller
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -158,7 +158,7 @@ Most development discussion is taking place on github in this repo. Further, the
 
 All contributions, bug reports, bug fixes, documentation improvements, enhancements and ideas are welcome.
 
-A detailed overview on how to contribute can be found in the **[contributing guide](https://dev.pandas.io/docs/contributing.html)**. There is also an [overview](.github/CONTRIBUTING.md) on GitHub.
+A detailed overview on how to contribute can be found in the **[contributing guide](https://pandas.pydata.org/docs/dev/development/contributing.html)**. There is also an [overview](.github/CONTRIBUTING.md) on GitHub.
 
 If you are simply looking to start working with the pandas codebase, navigate to the [GitHub "issues" tab](https://github.com/pandas-dev/pandas/issues) and start looking through interesting issues. There are a number of issues listed under [Docs](https://github.com/pandas-dev/pandas/issues?labels=Docs&sort=updated&state=open) and [good first issue](https://github.com/pandas-dev/pandas/issues?labels=good+first+issue&sort=updated&state=open) where you could start out.
 

diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py
@@ -50,6 +50,23 @@ def time_frame_op_with_scalar(self, dtype, scalar, op):
         op(self.df, scalar)
 
 
+class OpWithFillValue:  # benchmark: DataFrame.add / Series.add called with fill_value
+    def setup(self):  # builds the frame and series used by the time_* methods
+        # GH#31300
+        arr = np.arange(10 ** 6)  # 1,000,000 consecutive integers, so no missing values
+        df = DataFrame({"A": arr})
+        ser = df["A"]  # the frame's only column, as a Series
+
+        self.df = df
+        self.ser = ser
+
+    def time_frame_op_with_fill_value_no_nas(self):  # frame added to itself
+        self.df.add(self.df, fill_value=4)
+
+    def time_series_op_with_fill_value_no_nas(self):  # series added to itself
+        self.ser.add(self.ser, fill_value=4)
+
+
 class MixedFrameWithSeriesAxis0:
     params = [
         [

diff --git a/asv_bench/benchmarks/finalize.py b/asv_bench/benchmarks/finalize.py
@@ -0,0 +1,16 @@
+import pandas as pd
+
+
+class Finalize:  # benchmark: __finalize__ on an object whose attrs holds 1000 entries
+    param_names = ["series", "frame"]  # NOTE(review): two names, but params looks like ONE parameter list - confirm
+    params = [pd.Series, pd.DataFrame]  # constructors; presumably passed to setup/time_* as `param`
+
+    def setup(self, param):
+        N = 1000  # number of attrs entries to create
+        obj = param(dtype=float)  # constructed without data, float dtype
+        for i in range(N):
+            obj.attrs[i] = i  # integer key mapped to the same integer
+        self.obj = obj
+
+    def time_finalize_micro(self, param):  # `param` is unused here; the object comes from setup
+        self.obj.__finalize__(self.obj, method="__finalize__")  # finalized from itself
diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py
@@ -1,5 +1,6 @@
 import numpy as np
 
+import pandas as pd
 from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range
 
 from .pandas_vb_common import tm
@@ -118,4 +119,48 @@ def time_frame_from_range(self):
         self.df = DataFrame(self.data)
 
 
+class FromArrays:  # benchmark: DataFrame._from_arrays with float, Int64 and sparse columns
+
+    goal_time = 0.2
+
+    def setup(self):  # prepares 1000 column arrays of 1000 rows for each array kind
+        N_rows = 1000
+        N_cols = 1000
+        self.float_arrays = [np.random.randn(N_rows) for _ in range(N_cols)]
+        self.sparse_arrays = [  # random 0/1 integers wrapped as float64 SparseArray
+            pd.arrays.SparseArray(np.random.randint(0, 2, N_rows), dtype="float64")
+            for _ in range(N_cols)
+        ]
+        self.int_arrays = [  # random integers below 1000 as "Int64" pd.array
+            pd.array(np.random.randint(1000, size=N_rows), dtype="Int64")
+            for _ in range(N_cols)
+        ]
+        self.index = pd.Index(range(N_rows))  # row labels 0..N_rows-1
+        self.columns = pd.Index(range(N_cols))  # column labels 0..N_cols-1
+
+    def time_frame_from_arrays_float(self):  # numpy float columns
+        self.df = DataFrame._from_arrays(  # NOTE(review): private constructor; semantics not visible here
+            self.float_arrays,
+            index=self.index,
+            columns=self.columns,
+            verify_integrity=False,  # presumably skips input validation - confirm
+        )
+
+    def time_frame_from_arrays_int(self):  # "Int64" array columns
+        self.df = DataFrame._from_arrays(
+            self.int_arrays,
+            index=self.index,
+            columns=self.columns,
+            verify_integrity=False,
+        )
+
+    def time_frame_from_arrays_sparse(self):  # SparseArray columns
+        self.df = DataFrame._from_arrays(
+            self.sparse_arrays,
+            index=self.index,
+            columns=self.columns,
+            verify_integrity=False,
+        )
+
+
 from .pandas_vb_common import setup  # noqa: F401 isort:skip
diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
@@ -619,4 +619,17 @@ def time_select_dtypes(self, n):
         self.df.select_dtypes(include="int")
 
 
+class MemoryUsage:  # benchmark: DataFrame.memory_usage(deep=True)
+    def setup(self):  # df: 100000 x 2 random floats; df2: a copy with column "A" cast to object
+        self.df = DataFrame(np.random.randn(100000, 2), columns=list("AB"))
+        self.df2 = self.df.copy()
+        self.df2["A"] = self.df2["A"].astype("object")
+
+    def time_memory_usage(self):  # frame created directly from the float array
+        self.df.memory_usage(deep=True)
+
+    def time_memory_usage_object_dtype(self):  # frame whose column "A" was cast to object
+        self.df2.memory_usage(deep=True)
+
+
 from .pandas_vb_common import setup  # noqa: F401 isort:skip
diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
@@ -308,6 +308,31 @@ def time_frame_getitem_single_column_int(self):
         self.df_int_col[0]
 
 
+class IndexSingleRow:  # benchmark: single-row .iloc / .loc lookups on a 10-column frame
+    params = [True, False]  # whether the column labels are left unique
+    param_names = ["unique_cols"]
+
+    def setup(self, unique_cols):
+        arr = np.arange(10 ** 7).reshape(-1, 10)  # 1,000,000 rows x 10 columns
+        df = DataFrame(arr)
+        dtypes = ["u1", "u2", "u4", "u8", "i1", "i2", "i4", "i8", "f8", "f4"]
+        for i, d in enumerate(dtypes):
+            df[i] = df[i].astype(d)  # cast column i to the i-th dtype string
+
+        if not unique_cols:
+            # GH#33032 single-row lookups with non-unique columns were
+            #  15x slower than with unique columns
+            df.columns = ["A", "A"] + list(df.columns[2:])  # first two labels become duplicates
+
+        self.df = df
+
+    def time_iloc_row(self, unique_cols):  # positional lookup of row 10000
+        self.df.iloc[10000]
+
+    def time_loc_row(self, unique_cols):  # label lookup of 10000; no index is passed to DataFrame above
+        self.df.loc[10000]
+
+
 class AssignTimeseriesIndex:
     def setup(self):
         N = 100000