ARROW-18394: [Python][CI] Fix nightly job using pandas dev (temporarily skip tests) #15048

Merged · 9 commits · Dec 22, 2022
2 changes: 1 addition & 1 deletion ci/scripts/install_pandas.sh
@@ -36,7 +36,7 @@ else
fi

if [ "${pandas}" = "upstream_devel" ]; then
pip install git+https://github.com/pandas-dev/pandas.git --no-build-isolation
pip install git+https://github.com/pandas-dev/pandas.git
Member:

I wonder why we used --no-build-isolation in the first place. LGTM

Member Author:

I think just to be more efficient: all dependencies like numpy and cython were already installed in the env (to build pyarrow), so there was no need to install them again in an isolated build environment. But pandas now depends on versioneer as a build-time dependency (instead of vendoring it), so this is the cleanest solution (otherwise we would have to install versioneer first).
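For context, a rough sketch of the alternative hinted at above, i.e. keeping `--no-build-isolation` and pre-installing the build requirement instead. This is not what the PR does, and the exact build requirements of pandas (e.g. whether `versioneer` needs the `[toml]` extra) may differ or change:

```sh
# Hypothetical alternative (not used here): keep --no-build-isolation by
# installing pandas' new build-time dependency into the environment first.
# numpy and cython are already present, since they were needed to build pyarrow.
pip install versioneer  # possibly "versioneer[toml]", depending on the pandas version
pip install git+https://github.com/pandas-dev/pandas.git --no-build-isolation
```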

elif [ "${pandas}" = "nightly" ]; then
pip install --extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple --pre pandas
elif [ "${pandas}" = "latest" ]; then
44 changes: 25 additions & 19 deletions python/pyarrow/tests/test_compute.py
@@ -15,7 +15,7 @@
# specific language governing permissions and limitations
# under the License.

from datetime import datetime
import datetime
from functools import lru_cache, partial
import inspect
import itertools
@@ -1739,7 +1739,8 @@ def test_cast():
assert pc.cast(arr, options=allow_overflow_options) == pa.array(
[-1], type='int32')

arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)])
arr = pa.array(
[datetime.datetime(2010, 1, 1), datetime.datetime(2015, 1, 1)])
expected = pa.array([1262304000000, 1420070400000], type='timestamp[ms]')
assert pc.cast(arr, 'timestamp[ms]') == expected

@@ -1784,13 +1785,14 @@ def test_strptime():
arr = pa.array(["5/1/2020", None, "12/13/1900"])

got = pc.strptime(arr, format='%m/%d/%Y', unit='s')
expected = pa.array([datetime(2020, 5, 1), None, datetime(1900, 12, 13)],
type=pa.timestamp('s'))
expected = pa.array(
[datetime.datetime(2020, 5, 1), None, datetime.datetime(1900, 12, 13)],
type=pa.timestamp('s'))
assert got == expected
# Positional format
assert pc.strptime(arr, '%m/%d/%Y', unit='s') == got

expected = pa.array([datetime(2020, 1, 5), None, None],
expected = pa.array([datetime.datetime(2020, 1, 5), None, None],
type=pa.timestamp('s'))
got = pc.strptime(arr, format='%d/%m/%Y', unit='s', error_is_null=True)
assert got == expected
@@ -1933,7 +1935,11 @@ def _check_datetime_components(timestamps, timezone=None):
assert pc.subsecond(tsa).equals(pa.array(subseconds))

if ts.dt.tz:
is_dst = ts.apply(lambda x: x.dst().seconds > 0)
if ts.dt.tz is datetime.timezone.utc:
# datetime with utc returns None for dst()
is_dst = [False] * len(ts)
else:
is_dst = ts.apply(lambda x: x.dst().seconds > 0)
assert pc.is_dst(tsa).equals(pa.array(is_dst))

day_of_week_options = pc.DayOfWeekOptions(
@@ -1958,12 +1964,12 @@ def test_extract_datetime_components():
"2009-12-31T04:20:20.004132",
"2010-01-01T05:25:25.005321",
"2010-01-03T06:30:30.006163",
"2010-01-04T07:35:35",
"2006-01-01T08:40:40",
"2005-12-31T09:45:45",
"2008-12-28",
"2008-12-29",
"2012-01-01 01:02:03"]
"2010-01-04T07:35:35.0",
"2006-01-01T08:40:40.0",
"2005-12-31T09:45:45.0",
"2008-12-28T00:00:00.0",
"2008-12-29T00:00:00.0",
"2012-01-01T01:02:03.0"]
timezones = ["UTC", "US/Central", "Asia/Kolkata",
"Etc/GMT-4", "Etc/GMT+4", "Australia/Broken_Hill"]

@@ -1994,12 +2000,12 @@ def test_assume_timezone():
"2009-12-31T04:20:20.004132",
"2010-01-01T05:25:25.005321",
"2010-01-03T06:30:30.006163",
"2010-01-04T07:35:35",
"2006-01-01T08:40:40",
"2005-12-31T09:45:45",
"2008-12-28",
"2008-12-29",
"2012-01-01 01:02:03"])
"2010-01-04T07:35:35.0",
Member:

Just curious: why are those being updated? Are formats like "2008-12-28" not supported anymore?

Member Author:

That string by itself is still supported, but a mixture of different formats in a single list is not (I am planning to raise an issue about this, because it seems too pedantic, given that all of those strings are ISO strings).
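For anyone hitting this in the nightly job, a minimal sketch of the behaviour described above, assuming the pandas 2.0.0.dev "consistent parsing" change where to_datetime infers a format from the first string and requires the rest to match (the exact error message may differ):

```python
import pandas as pd

# One consistent ISO layout for every element: parses on 1.x and on 2.0.0.dev.
pd.to_datetime(["2009-12-31T04:20:20.004132", "2010-01-04T07:35:35.0"])

# Mixed layouts in one list: fine on pandas 1.x, but 2.0.0.dev infers the
# format from the first element and then rejects the date-only "2008-12-28".
try:
    pd.to_datetime(["2009-12-31T04:20:20.004132", "2008-12-28"])
except ValueError as exc:
    print("mixed formats rejected on pandas 2.0.0.dev:", exc)
```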

"2006-01-01T08:40:40.0",
"2005-12-31T09:45:45.0",
"2008-12-28T00:00:00.0",
"2008-12-29T00:00:00.0",
"2012-01-01T01:02:03.0"])
nonexistent = pd.to_datetime(["2015-03-29 02:30:00",
"2015-03-29 03:30:00"])
ambiguous = pd.to_datetime(["2018-10-28 01:20:00",
@@ -2747,7 +2753,7 @@ def test_list_element():


def test_count_distinct():
seed = datetime.now()
seed = datetime.datetime.now()
samples = [seed.replace(year=y) for y in range(1992, 2092)]
arr = pa.array(samples, pa.timestamp("ns"))
assert pc.count_distinct(arr) == pa.scalar(len(samples), type=pa.int64())
36 changes: 35 additions & 1 deletion python/pyarrow/tests/test_pandas.py
@@ -187,6 +187,10 @@ def test_column_index_names_are_preserved(self):
_check_pandas_roundtrip(df, preserve_index=True)

def test_column_index_names_with_tz(self):
if Version("2.0.0.dev0") <= Version(pd.__version__) < Version("2.0.0"):
# TODO: regression in pandas, should be fixed before final 2.0.0
# https://github.com/pandas-dev/pandas/issues/50140
pytest.skip("Regression in pandas 2.0.0.dev")
# ARROW-13756
# Bug if index is timezone aware DataTimeIndex

@@ -449,6 +453,11 @@ def test_mixed_column_names(self):
preserve_index=True)

def test_binary_column_name(self):
if Version("2.0.0.dev0") <= Version(pd.__version__) < Version("2.0.0"):
# TODO: regression in pandas, should be fixed before final 2.0.0
# https://issues.apache.org/jira/browse/ARROW-18394
# https://github.com/pandas-dev/pandas/issues/50127
pytest.skip("Regression in pandas 2.0.0.dev")
column_data = ['い']
key = 'あ'.encode()
data = {key: column_data}
@@ -1496,7 +1505,11 @@ def test_fixed_offset_timezone(self):
pd.NaT
]
})
_check_pandas_roundtrip(df)
# 'check_dtype=False' because pandas >= 2 uses datetime.timezone
# instead of pytz.FixedOffset, and thus the dtype is not exactly
# identical (pyarrow still defaults to pytz)
# TODO remove if https://github.com/apache/arrow/issues/15047 is fixed
_check_pandas_roundtrip(df, check_dtype=False)
_check_serialize_components_roundtrip(df)

def test_timedeltas_no_nulls(self):
@@ -2051,6 +2064,11 @@ def test_nested_smaller_ints(self):
assert result3.equals(expected3)

def test_infer_lists(self):
if ((Version(np.__version__) >= Version("1.25.0.dev0")) and
(Version(pd.__version__) < Version("2.0.0"))):
# TODO: regression in pandas with numpy 1.25dev
# https://github.com/pandas-dev/pandas/issues/50360
pytest.skip("Regression in pandas with numpy 1.25")
data = OrderedDict([
('nan_ints', [[None, 1], [2, 3]]),
('ints', [[0, 1], [2, 3]]),
@@ -2100,6 +2118,11 @@ def test_infer_numpy_array(self):
_check_pandas_roundtrip(df, expected_schema=expected_schema)

def test_to_list_of_structs_pandas(self):
if ((Version(np.__version__) >= Version("1.25.0.dev0")) and
(Version(pd.__version__) < Version("2.0.0"))):
# TODO: regression in pandas with numpy 1.25dev
# https://github.com/pandas-dev/pandas/issues/50360
pytest.skip("Regression in pandas with numpy 1.25")
ints = pa.array([1, 2, 3], pa.int32())
strings = pa.array([['a', 'b'], ['c', 'd'], ['e', 'f']],
pa.list_(pa.string()))
@@ -2169,6 +2192,11 @@ def test_array_from_nested_arrays(self):
assert result.equals(expected)

def test_nested_large_list(self):
if ((Version(np.__version__) >= Version("1.25.0.dev0")) and
(Version(pd.__version__) < Version("2.0.0"))):
# TODO: regression in pandas with numpy 1.25dev
# https://github.com/pandas-dev/pandas/issues/50360
pytest.skip("Regression in pandas with numpy 1.25")
s = (pa.array([[[1, 2, 3], [4]], None],
type=pa.large_list(pa.large_list(pa.int64())))
.to_pandas())
@@ -2877,6 +2905,12 @@ def _fully_loaded_dataframe_example():

@pytest.mark.parametrize('columns', ([b'foo'], ['foo']))
def test_roundtrip_with_bytes_unicode(columns):
if Version("2.0.0.dev0") <= Version(pd.__version__) < Version("2.0.0"):
# TODO: regression in pandas, should be fixed before final 2.0.0
# https://issues.apache.org/jira/browse/ARROW-18394
# https://github.com/pandas-dev/pandas/issues/50127
pytest.skip("Regression in pandas 2.0.0.dev")

df = pd.DataFrame(columns=columns)
table1 = pa.Table.from_pandas(df)
table2 = pa.Table.from_pandas(table1.to_pandas())
3 changes: 3 additions & 0 deletions python/setup.cfg
@@ -34,3 +34,6 @@ faulthandler_timeout = 300
[pep8]
ignore = E211,E225,E226,E227,E402,W504
max_line_length = 88

[flake8]
max-line-length = 88