ARROW-18394: [Python][CI] Fix nightly job using pandas dev (temporarily skip tests) #15048

Merged · 9 commits · Dec 22, 2022
2 changes: 1 addition & 1 deletion ci/scripts/install_pandas.sh
@@ -36,7 +36,7 @@ else
fi

if [ "${pandas}" = "upstream_devel" ]; then
pip install git+https://github.com/pandas-dev/pandas.git --no-build-isolation
pip install git+https://github.com/pandas-dev/pandas.git
Member:

I wonder why we used --no-build-isolation in the first place. LGTM

Member Author:

I think just to be more efficient: all dependencies like numpy and cython were already installed in the env (to build pyarrow), so there was no need to install them again in an isolated build environment. But pandas now depends on versioneer as a build-time dependency (instead of vendoring it), so this is the cleanest solution (otherwise we would have to install versioneer first).
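For context, a rough sketch of the alternative hinted at above, i.e. keeping `--no-build-isolation` and pre-installing the build requirement instead. This is not what the PR does, and the exact build requirements of pandas (e.g. whether `versioneer` needs the `[toml]` extra) may differ or change:

```sh
# Hypothetical alternative (not used here): keep --no-build-isolation by
# installing pandas' new build-time dependency into the environment first.
# numpy and cython are already present, since they were needed to build pyarrow.
pip install versioneer  # possibly "versioneer[toml]", depending on the pandas version
pip install git+https://github.com/pandas-dev/pandas.git --no-build-isolation
```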

elif [ "${pandas}" = "nightly" ]; then
pip install --extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple --pre pandas
elif [ "${pandas}" = "latest" ]; then
44 changes: 25 additions & 19 deletions python/pyarrow/tests/test_compute.py
@@ -15,7 +15,7 @@
# specific language governing permissions and limitations
# under the License.

from datetime import datetime
import datetime
from functools import lru_cache, partial
import inspect
import itertools
@@ -1739,7 +1739,8 @@ def test_cast():
assert pc.cast(arr, options=allow_overflow_options) == pa.array(
[-1], type='int32')

arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)])
arr = pa.array(
[datetime.datetime(2010, 1, 1), datetime.datetime(2015, 1, 1)])
expected = pa.array([1262304000000, 1420070400000], type='timestamp[ms]')
assert pc.cast(arr, 'timestamp[ms]') == expected

@@ -1784,13 +1785,14 @@ def test_strptime():
arr = pa.array(["5/1/2020", None, "12/13/1900"])

got = pc.strptime(arr, format='%m/%d/%Y', unit='s')
expected = pa.array([datetime(2020, 5, 1), None, datetime(1900, 12, 13)],
type=pa.timestamp('s'))
expected = pa.array(
[datetime.datetime(2020, 5, 1), None, datetime.datetime(1900, 12, 13)],
type=pa.timestamp('s'))
assert got == expected
# Positional format
assert pc.strptime(arr, '%m/%d/%Y', unit='s') == got

expected = pa.array([datetime(2020, 1, 5), None, None],
expected = pa.array([datetime.datetime(2020, 1, 5), None, None],
type=pa.timestamp('s'))
got = pc.strptime(arr, format='%d/%m/%Y', unit='s', error_is_null=True)
assert got == expected
@@ -1933,7 +1935,11 @@ def _check_datetime_components(timestamps, timezone=None):
assert pc.subsecond(tsa).equals(pa.array(subseconds))

if ts.dt.tz:
is_dst = ts.apply(lambda x: x.dst().seconds > 0)
if ts.dt.tz is datetime.timezone.utc:
# datetime with utc returns None for dst()
is_dst = [False] * len(ts)
else:
is_dst = ts.apply(lambda x: x.dst().seconds > 0)
assert pc.is_dst(tsa).equals(pa.array(is_dst))

day_of_week_options = pc.DayOfWeekOptions(
@@ -1958,12 +1964,12 @@ def test_extract_datetime_components():
"2009-12-31T04:20:20.004132",
"2010-01-01T05:25:25.005321",
"2010-01-03T06:30:30.006163",
"2010-01-04T07:35:35",
"2006-01-01T08:40:40",
"2005-12-31T09:45:45",
"2008-12-28",
"2008-12-29",
"2012-01-01 01:02:03"]
"2010-01-04T07:35:35.0",
"2006-01-01T08:40:40.0",
"2005-12-31T09:45:45.0",
"2008-12-28T00:00:00.0",
"2008-12-29T00:00:00.0",
"2012-01-01T01:02:03.0"]
timezones = ["UTC", "US/Central", "Asia/Kolkata",
"Etc/GMT-4", "Etc/GMT+4", "Australia/Broken_Hill"]

@@ -1994,12 +2000,12 @@ def test_assume_timezone():
"2009-12-31T04:20:20.004132",
"2010-01-01T05:25:25.005321",
"2010-01-03T06:30:30.006163",
"2010-01-04T07:35:35",
"2006-01-01T08:40:40",
"2005-12-31T09:45:45",
"2008-12-28",
"2008-12-29",
"2012-01-01 01:02:03"])
"2010-01-04T07:35:35.0",
Member:

Just curious: why are those being updated? Are formats like "2008-12-28" not supported anymore?

Member Author:

That string by itself is still supported, but a mixture of different formats in a single list is not (I am planning to raise an issue about this, because it seems too pedantic, given that all of those strings are ISO strings).
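For anyone hitting this in the nightly job, a minimal sketch of the behaviour described above, assuming the pandas 2.0.0.dev "consistent parsing" change where to_datetime infers a format from the first string and requires the rest to match (the exact error message may differ):

```python
import pandas as pd

# One consistent ISO layout for every element: parses on 1.x and on 2.0.0.dev.
pd.to_datetime(["2009-12-31T04:20:20.004132", "2010-01-04T07:35:35.0"])

# Mixed layouts in one list: fine on pandas 1.x, but 2.0.0.dev infers the
# format from the first element and then rejects the date-only "2008-12-28".
try:
    pd.to_datetime(["2009-12-31T04:20:20.004132", "2008-12-28"])
except ValueError as exc:
    print("mixed formats rejected on pandas 2.0.0.dev:", exc)
```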

"2006-01-01T08:40:40.0",
"2005-12-31T09:45:45.0",
"2008-12-28T00:00:00.0",
"2008-12-29T00:00:00.0",
"2012-01-01T01:02:03.0"])
nonexistent = pd.to_datetime(["2015-03-29 02:30:00",
"2015-03-29 03:30:00"])
ambiguous = pd.to_datetime(["2018-10-28 01:20:00",
@@ -2747,7 +2753,7 @@ def test_list_element():


def test_count_distinct():
seed = datetime.now()
seed = datetime.datetime.now()
samples = [seed.replace(year=y) for y in range(1992, 2092)]
arr = pa.array(samples, pa.timestamp("ns"))
assert pc.count_distinct(arr) == pa.scalar(len(samples), type=pa.int64())
36 changes: 35 additions & 1 deletion python/pyarrow/tests/test_pandas.py
@@ -187,6 +187,10 @@ def test_column_index_names_are_preserved(self):
_check_pandas_roundtrip(df, preserve_index=True)

def test_column_index_names_with_tz(self):
if Version("2.0.0.dev0") <= Version(pd.__version__) < Version("2.0.0"):
# TODO: regression in pandas, should be fixed before final 2.0.0
# https://github.com/pandas-dev/pandas/issues/50140
pytest.skip("Regression in pandas 2.0.0.dev")
# ARROW-13756
# Bug if index is timezone aware DataTimeIndex

@@ -449,6 +453,11 @@ def test_mixed_column_names(self):
preserve_index=True)

def test_binary_column_name(self):
if Version("2.0.0.dev0") <= Version(pd.__version__) < Version("2.0.0"):
# TODO: regression in pandas, should be fixed before final 2.0.0
# https://issues.apache.org/jira/browse/ARROW-18394
# https://github.com/pandas-dev/pandas/issues/50127
pytest.skip("Regression in pandas 2.0.0.dev")
column_data = ['い']
key = 'あ'.encode()
data = {key: column_data}
@@ -1496,7 +1505,11 @@ def test_fixed_offset_timezone(self):
pd.NaT
]
})
_check_pandas_roundtrip(df)
# 'check_dtype=False' because pandas >= 2 uses datetime.timezone
# instead of pytz.FixedOffset, and thus the dtype is not exactly
# identical (pyarrow still defaults to pytz)
# TODO remove if https://github.com/apache/arrow/issues/15047 is fixed
_check_pandas_roundtrip(df, check_dtype=False)
_check_serialize_components_roundtrip(df)

def test_timedeltas_no_nulls(self):
@@ -2051,6 +2064,11 @@ def test_nested_smaller_ints(self):
assert result3.equals(expected3)

def test_infer_lists(self):
if ((Version(np.__version__) >= Version("1.25.0.dev0")) and
(Version(pd.__version__) < Version("2.0.0"))):
# TODO: regression in pandas with numpy 1.25dev
# https://github.com/pandas-dev/pandas/issues/50360
pytest.skip("Regression in pandas with numpy 1.25")
data = OrderedDict([
('nan_ints', [[None, 1], [2, 3]]),
('ints', [[0, 1], [2, 3]]),
@@ -2100,6 +2118,11 @@ def test_infer_numpy_array(self):
_check_pandas_roundtrip(df, expected_schema=expected_schema)

def test_to_list_of_structs_pandas(self):
if ((Version(np.__version__) >= Version("1.25.0.dev0")) and
(Version(pd.__version__) < Version("2.0.0"))):
# TODO: regression in pandas with numpy 1.25dev
# https://github.com/pandas-dev/pandas/issues/50360
pytest.skip("Regression in pandas with numpy 1.25")
ints = pa.array([1, 2, 3], pa.int32())
strings = pa.array([['a', 'b'], ['c', 'd'], ['e', 'f']],
pa.list_(pa.string()))
@@ -2169,6 +2192,11 @@ def test_array_from_nested_arrays(self):
assert result.equals(expected)

def test_nested_large_list(self):
if ((Version(np.__version__) >= Version("1.25.0.dev0")) and
(Version(pd.__version__) < Version("2.0.0"))):
# TODO: regression in pandas with numpy 1.25dev
# https://github.com/pandas-dev/pandas/issues/50360
pytest.skip("Regression in pandas with numpy 1.25")
s = (pa.array([[[1, 2, 3], [4]], None],
type=pa.large_list(pa.large_list(pa.int64())))
.to_pandas())
@@ -2877,6 +2905,12 @@ def _fully_loaded_dataframe_example():

@pytest.mark.parametrize('columns', ([b'foo'], ['foo']))
def test_roundtrip_with_bytes_unicode(columns):
if Version("2.0.0.dev0") <= Version(pd.__version__) < Version("2.0.0"):
# TODO: regression in pandas, should be fixed before final 2.0.0
# https://issues.apache.org/jira/browse/ARROW-18394
# https://github.com/pandas-dev/pandas/issues/50127
pytest.skip("Regression in pandas 2.0.0.dev")

df = pd.DataFrame(columns=columns)
table1 = pa.Table.from_pandas(df)
table2 = pa.Table.from_pandas(table1.to_pandas())
3 changes: 3 additions & 0 deletions python/setup.cfg
@@ -34,3 +34,6 @@ faulthandler_timeout = 300
[pep8]
ignore = E211,E225,E226,E227,E402,W504
max_line_length = 88

[flake8]
max-line-length = 88