Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DEPR: delim_whitespace kwd in read_csv #56557

Merged
merged 6 commits into from
Dec 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/user_guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,9 @@ delim_whitespace : boolean, default False
If this option is set to ``True``, nothing should be passed in for the
``delimiter`` parameter.

.. deprecated:: 2.2.0
Use ``sep="\s+"`` instead.

Column and index locations and names
++++++++++++++++++++++++++++++++++++

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -484,6 +484,7 @@ Other Deprecations
- Deprecated support for combining parsed datetime columns in :func:`read_csv` along with the ``keep_date_col`` keyword (:issue:`55569`)
- Deprecated the :attr:`.DataFrameGroupBy.grouper` and :attr:`SeriesGroupBy.grouper`; these attributes will be removed in a future version of pandas (:issue:`56521`)
- Deprecated the :class:`.Grouping` attributes ``group_index``, ``result_index``, and ``group_arraylike``; these will be removed in a future version of pandas (:issue:`56148`)
- Deprecated the ``delim_whitespace`` keyword in :func:`read_csv` and :func:`read_table`, use ``sep="\s+"`` instead (:issue:`55569`)
- Deprecated the ``errors="ignore"`` option in :func:`to_datetime`, :func:`to_timedelta`, and :func:`to_numeric`; explicitly catch exceptions instead (:issue:`54467`)
- Deprecated the ``fastpath`` keyword in the :class:`Series` constructor (:issue:`20110`)
- Deprecated the ``kind`` keyword in :meth:`Series.resample` and :meth:`DataFrame.resample`, explicitly cast the object's ``index`` instead (:issue:`55895`)
Expand Down
40 changes: 34 additions & 6 deletions pandas/io/parsers/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,9 @@
used as the ``sep`` delimiter. Equivalent to setting ``sep='\\s+'``. If this option
is set to ``True``, nothing should be passed in for the ``delimiter``
parameter.

.. deprecated:: 2.2.0
Use ``sep="\\s+"`` instead.
low_memory : bool, default True
Internally process the file in chunks, resulting in lower memory use
while parsing, but possibly mixed type inference. To ensure no mixed
Expand Down Expand Up @@ -670,7 +673,7 @@ def read_csv(
encoding_errors: str | None = ...,
dialect: str | csv.Dialect | None = ...,
on_bad_lines=...,
delim_whitespace: bool = ...,
delim_whitespace: bool | lib.NoDefault = ...,
low_memory: bool = ...,
memory_map: bool = ...,
float_precision: Literal["high", "legacy"] | None = ...,
Expand Down Expand Up @@ -730,7 +733,7 @@ def read_csv(
encoding_errors: str | None = ...,
dialect: str | csv.Dialect | None = ...,
on_bad_lines=...,
delim_whitespace: bool = ...,
delim_whitespace: bool | lib.NoDefault = ...,
low_memory: bool = ...,
memory_map: bool = ...,
float_precision: Literal["high", "legacy"] | None = ...,
Expand Down Expand Up @@ -790,7 +793,7 @@ def read_csv(
encoding_errors: str | None = ...,
dialect: str | csv.Dialect | None = ...,
on_bad_lines=...,
delim_whitespace: bool = ...,
delim_whitespace: bool | lib.NoDefault = ...,
low_memory: bool = ...,
memory_map: bool = ...,
float_precision: Literal["high", "legacy"] | None = ...,
Expand Down Expand Up @@ -850,7 +853,7 @@ def read_csv(
encoding_errors: str | None = ...,
dialect: str | csv.Dialect | None = ...,
on_bad_lines=...,
delim_whitespace: bool = ...,
delim_whitespace: bool | lib.NoDefault = ...,
low_memory: bool = ...,
memory_map: bool = ...,
float_precision: Literal["high", "legacy"] | None = ...,
Expand Down Expand Up @@ -928,7 +931,7 @@ def read_csv(
# Error Handling
on_bad_lines: str = "error",
# Internal
delim_whitespace: bool = False,
delim_whitespace: bool | lib.NoDefault = lib.no_default,
low_memory: bool = _c_parser_defaults["low_memory"],
memory_map: bool = False,
float_precision: Literal["high", "legacy"] | None = None,
Expand Down Expand Up @@ -978,6 +981,17 @@ def read_csv(
stacklevel=find_stack_level(),
)

if delim_whitespace is not lib.no_default:
# GH#55569
warnings.warn(
"The 'delim_whitespace' keyword in pd.read_csv is deprecated and "
"will be removed in a future version. Use ``sep='\\s+'`` instead",
FutureWarning,
stacklevel=find_stack_level(),
)
else:
delim_whitespace = False

if verbose is not lib.no_default:
# GH#55569
warnings.warn(
Expand Down Expand Up @@ -1305,7 +1319,7 @@ def read_table(
# Error Handling
on_bad_lines: str = "error",
# Internal
delim_whitespace: bool = False,
delim_whitespace: bool | lib.NoDefault = lib.no_default,
low_memory: bool = _c_parser_defaults["low_memory"],
memory_map: bool = False,
float_precision: str | None = None,
Expand Down Expand Up @@ -1346,6 +1360,17 @@ def read_table(
stacklevel=find_stack_level(),
)

if delim_whitespace is not lib.no_default:
# GH#55569
warnings.warn(
"The 'delim_whitespace' keyword in pd.read_table is deprecated and "
"will be removed in a future version. Use ``sep='\\s+'`` instead",
FutureWarning,
stacklevel=find_stack_level(),
)
else:
delim_whitespace = False

if verbose is not lib.no_default:
# GH#55569
warnings.warn(
Expand Down Expand Up @@ -2131,6 +2156,9 @@ def _refine_defaults_read(
used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
is set to True, nothing should be passed in for the ``delimiter``
parameter.

.. deprecated:: 2.2.0
Use ``sep="\\s+"`` instead.
engine : {{'c', 'python'}}
Parser engine to use. The C engine is faster while the python engine is
currently more feature-complete.
Expand Down
75 changes: 56 additions & 19 deletions pandas/tests/io/parser/common/test_common_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,13 +500,21 @@ def test_trailing_spaces(all_parsers, kwargs, expected):
data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" # noqa: E501
parser = all_parsers

depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"

if parser.engine == "pyarrow":
msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data.replace(",", " ")), **kwargs)
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_csv(StringIO(data.replace(",", " ")), **kwargs)
return

result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs)
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs)
tm.assert_frame_equal(result, expected)


Expand All @@ -515,8 +523,12 @@ def test_raise_on_sep_with_delim_whitespace(all_parsers):
data = "a b c\n1 2 3"
parser = all_parsers

depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with pytest.raises(ValueError, match="you can only specify one"):
parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True)
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True)


def test_read_filepath_or_buffer(all_parsers):
Expand All @@ -539,18 +551,27 @@ def test_single_char_leading_whitespace(all_parsers, delim_whitespace):
b\n"""

expected = DataFrame({"MyColumn": list("abab")})
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"

if parser.engine == "pyarrow":
msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace
)
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_csv(
StringIO(data),
skipinitialspace=True,
delim_whitespace=delim_whitespace,
)
return

result = parser.read_csv(
StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace
)
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_csv(
StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace
)
tm.assert_frame_equal(result, expected)


Expand Down Expand Up @@ -798,12 +819,20 @@ def test_read_table_delim_whitespace_default_sep(all_parsers):
f = StringIO("a b c\n1 -2 -3\n4 5 6")
parser = all_parsers

depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated"

if parser.engine == "pyarrow":
msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_table(f, delim_whitespace=True)
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_table(f, delim_whitespace=True)
return
result = parser.read_table(f, delim_whitespace=True)
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_table(f, delim_whitespace=True)
expected = DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]})
tm.assert_frame_equal(result, expected)

Expand All @@ -817,11 +846,15 @@ def test_read_csv_delim_whitespace_non_default_sep(all_parsers, delimiter):
"Specified a delimiter with both sep and "
"delim_whitespace=True; you can only specify one."
)
with pytest.raises(ValueError, match=msg):
parser.read_csv(f, delim_whitespace=True, sep=delimiter)
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
with pytest.raises(ValueError, match=msg):
parser.read_csv(f, delim_whitespace=True, sep=delimiter)

with pytest.raises(ValueError, match=msg):
parser.read_csv(f, delim_whitespace=True, delimiter=delimiter)
with pytest.raises(ValueError, match=msg):
parser.read_csv(f, delim_whitespace=True, delimiter=delimiter)


def test_read_csv_delimiter_and_sep_no_default(all_parsers):
Expand Down Expand Up @@ -858,11 +891,15 @@ def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter):
"Specified a delimiter with both sep and "
"delim_whitespace=True; you can only specify one."
)
with pytest.raises(ValueError, match=msg):
parser.read_table(f, delim_whitespace=True, sep=delimiter)
depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated"
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
with pytest.raises(ValueError, match=msg):
parser.read_table(f, delim_whitespace=True, sep=delimiter)

with pytest.raises(ValueError, match=msg):
parser.read_table(f, delim_whitespace=True, delimiter=delimiter)
with pytest.raises(ValueError, match=msg):
parser.read_table(f, delim_whitespace=True, delimiter=delimiter)


@skip_pyarrow
Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/io/parser/test_c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,11 @@ def test_delim_whitespace_custom_terminator(c_parser_only):
data = "a b c~1 2 3~4 5 6~7 8 9"
parser = c_parser_only

df = parser.read_csv(StringIO(data), lineterminator="~", delim_whitespace=True)
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
df = parser.read_csv(StringIO(data), lineterminator="~", delim_whitespace=True)
expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"])
tm.assert_frame_equal(df, expected)

Expand Down
17 changes: 14 additions & 3 deletions pandas/tests/io/parser/test_comment.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,12 @@ def test_line_comment(all_parsers, read_kwargs, request):
#ignore this line
5.,NaN,10.0
"""
warn = None
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"

if read_kwargs.get("delim_whitespace"):
data = data.replace(",", " ")
warn = FutureWarning
elif read_kwargs.get("lineterminator"):
data = data.replace("\n", read_kwargs.get("lineterminator"))

Expand All @@ -55,15 +59,22 @@ def test_line_comment(all_parsers, read_kwargs, request):
else:
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), **read_kwargs)
with tm.assert_produces_warning(
warn, match=depr_msg, check_stacklevel=False
):
parser.read_csv(StringIO(data), **read_kwargs)
return
elif parser.engine == "python" and read_kwargs.get("lineterminator"):
msg = r"Custom line terminators not supported in python parser \(yet\)"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), **read_kwargs)
with tm.assert_produces_warning(
warn, match=depr_msg, check_stacklevel=False
):
parser.read_csv(StringIO(data), **read_kwargs)
return

result = parser.read_csv(StringIO(data), **read_kwargs)
with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False):
result = parser.read_csv(StringIO(data), **read_kwargs)

expected = DataFrame(
[[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/io/parser/test_header.py
Original file line number Diff line number Diff line change
Expand Up @@ -706,7 +706,11 @@ def test_header_delim_whitespace(all_parsers):
3,4
"""

result = parser.read_csv(StringIO(data), delim_whitespace=True)
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_csv(StringIO(data), delim_whitespace=True)
expected = DataFrame({"a,b": ["1,2", "3,4"]})
tm.assert_frame_equal(result, expected)

Expand Down
10 changes: 8 additions & 2 deletions pandas/tests/io/parser/test_read_fwf.py
Original file line number Diff line number Diff line change
Expand Up @@ -602,7 +602,10 @@ def test_skiprows_inference():
101.6 956.1
""".strip()
skiprows = 2
expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True)

depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True)

result = read_fwf(StringIO(data), skiprows=skiprows)
tm.assert_frame_equal(result, expected)
Expand All @@ -617,7 +620,10 @@ def test_skiprows_by_index_inference():
456 78 9 456
""".strip()
skiprows = [0, 2]
expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True)

depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True)

result = read_fwf(StringIO(data), skiprows=skiprows)
tm.assert_frame_equal(result, expected)
Expand Down
17 changes: 11 additions & 6 deletions pandas/tests/io/parser/test_skiprows.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,12 +216,17 @@ def test_skiprows_lineterminator(all_parsers, lineterminator, request):
request.applymarker(mark)

data = data.replace("\n", lineterminator)
result = parser.read_csv(
StringIO(data),
skiprows=1,
delim_whitespace=True,
names=["date", "time", "var", "flag", "oflag"],
)

depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_csv(
StringIO(data),
skiprows=1,
delim_whitespace=True,
names=["date", "time", "var", "flag", "oflag"],
)
tm.assert_frame_equal(result, expected)


Expand Down
Loading