Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DEPR: delim_whitespace kwd in read_csv #56557

Merged
merged 6 commits into from
Dec 21, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/user_guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,9 @@ delim_whitespace : boolean, default False
If this option is set to ``True``, nothing should be passed in for the
``delimiter`` parameter.

.. deprecated: 2.2.0
Use ``sep="\\s+" instead.

Column and index locations and names
++++++++++++++++++++++++++++++++++++

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,7 @@ Other Deprecations
- Deprecated strings ``H``, ``S``, ``U``, and ``N`` denoting units in :func:`to_timedelta` (:issue:`52536`)
- Deprecated strings ``H``, ``T``, ``S``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`52536`)
- Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`52536`)
- Deprecated the 'delim_whitespace' keyword in :func:`read_csv` and :func:`read_table`, use ``sep="\\s+"`` instead (:issue:`55569`)
jbrockmendel marked this conversation as resolved.
Show resolved Hide resolved
- Deprecated the :attr:`.DataFrameGroupBy.grouper` and :attr:`SeriesGroupBy.grouper`; these attributes will be removed in a future version of pandas (:issue:`56521`)
- Deprecated the :class:`.Grouping` attributes ``group_index``, ``result_index``, and ``group_arraylike``; these will be removed in a future version of pandas (:issue:`56148`)
- Deprecated the ``errors="ignore"`` option in :func:`to_datetime`, :func:`to_timedelta`, and :func:`to_numeric`; explicitly catch exceptions instead (:issue:`54467`)
Expand Down
41 changes: 35 additions & 6 deletions pandas/io/parsers/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,9 @@
used as the ``sep`` delimiter. Equivalent to setting ``sep='\\s+'``. If this option
is set to ``True``, nothing should be passed in for the ``delimiter``
parameter.

.. deprecated:: 2.2.0
Use ``sep="\\s+"`` instead.
low_memory : bool, default True
Internally process the file in chunks, resulting in lower memory use
while parsing, but possibly mixed type inference. To ensure no mixed
Expand Down Expand Up @@ -669,7 +672,7 @@ def read_csv(
encoding_errors: str | None = ...,
dialect: str | csv.Dialect | None = ...,
on_bad_lines=...,
delim_whitespace: bool = ...,
delim_whitespace: bool | lib.NoDefault = ...,
low_memory: bool = ...,
memory_map: bool = ...,
float_precision: Literal["high", "legacy"] | None = ...,
Expand Down Expand Up @@ -729,7 +732,7 @@ def read_csv(
encoding_errors: str | None = ...,
dialect: str | csv.Dialect | None = ...,
on_bad_lines=...,
delim_whitespace: bool = ...,
delim_whitespace: bool | lib.NoDefault = ...,
low_memory: bool = ...,
memory_map: bool = ...,
float_precision: Literal["high", "legacy"] | None = ...,
Expand Down Expand Up @@ -789,7 +792,7 @@ def read_csv(
encoding_errors: str | None = ...,
dialect: str | csv.Dialect | None = ...,
on_bad_lines=...,
delim_whitespace: bool = ...,
delim_whitespace: bool | lib.NoDefault = ...,
low_memory: bool = ...,
memory_map: bool = ...,
float_precision: Literal["high", "legacy"] | None = ...,
Expand Down Expand Up @@ -849,7 +852,7 @@ def read_csv(
encoding_errors: str | None = ...,
dialect: str | csv.Dialect | None = ...,
on_bad_lines=...,
delim_whitespace: bool = ...,
delim_whitespace: bool | lib.NoDefault = ...,
low_memory: bool = ...,
memory_map: bool = ...,
float_precision: Literal["high", "legacy"] | None = ...,
Expand Down Expand Up @@ -927,7 +930,7 @@ def read_csv(
# Error Handling
on_bad_lines: str = "error",
# Internal
delim_whitespace: bool = False,
delim_whitespace: bool | lib.NoDefault = lib.no_default,
low_memory: bool = _c_parser_defaults["low_memory"],
memory_map: bool = False,
float_precision: Literal["high", "legacy"] | None = None,
Expand All @@ -944,6 +947,18 @@ def read_csv(
FutureWarning,
stacklevel=find_stack_level(),
)

if delim_whitespace is not lib.no_default:
# GH#55569
warnings.warn(
"The 'delim_whitespace' keyword in pd.read_csv is deprecated and "
"will be removed in a future version. Use ``sep='\\s+'`` instead",
FutureWarning,
stacklevel=find_stack_level(),
)
else:
delim_whitespace = False

# locals() should never be modified
kwds = locals().copy()
del kwds["filepath_or_buffer"]
Expand Down Expand Up @@ -1260,7 +1275,7 @@ def read_table(
# Error Handling
on_bad_lines: str = "error",
# Internal
delim_whitespace: bool = False,
delim_whitespace: bool | lib.NoDefault = lib.no_default,
low_memory: bool = _c_parser_defaults["low_memory"],
memory_map: bool = False,
float_precision: str | None = None,
Expand All @@ -1278,6 +1293,17 @@ def read_table(
stacklevel=find_stack_level(),
)

if delim_whitespace is not lib.no_default:
# GH#55569
warnings.warn(
"The 'delim_whitespace' keyword in pd.read_table is deprecated and "
"will be removed in a future version. Use ``sep='\\s+'`` instead",
FutureWarning,
stacklevel=find_stack_level(),
)
else:
delim_whitespace = False

# locals() should never be modified
kwds = locals().copy()
del kwds["filepath_or_buffer"]
Expand Down Expand Up @@ -2052,6 +2078,9 @@ def _refine_defaults_read(
used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
is set to True, nothing should be passed in for the ``delimiter``
parameter.

.. deprecated:: 2.2.0
Use ``sep="\\s+"`` instead.
engine : {{'c', 'python'}}
Parser engine to use. The C engine is faster while the python engine is
currently more feature-complete.
Expand Down
75 changes: 56 additions & 19 deletions pandas/tests/io/parser/common/test_common_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,13 +500,21 @@ def test_trailing_spaces(all_parsers, kwargs, expected):
data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" # noqa: E501
parser = all_parsers

depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"

if parser.engine == "pyarrow":
msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data.replace(",", " ")), **kwargs)
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_csv(StringIO(data.replace(",", " ")), **kwargs)
return

result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs)
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs)
tm.assert_frame_equal(result, expected)


Expand All @@ -515,8 +523,12 @@ def test_raise_on_sep_with_delim_whitespace(all_parsers):
data = "a b c\n1 2 3"
parser = all_parsers

depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with pytest.raises(ValueError, match="you can only specify one"):
parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True)
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True)


def test_read_filepath_or_buffer(all_parsers):
Expand All @@ -539,18 +551,27 @@ def test_single_char_leading_whitespace(all_parsers, delim_whitespace):
b\n"""

expected = DataFrame({"MyColumn": list("abab")})
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"

if parser.engine == "pyarrow":
msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace
)
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_csv(
StringIO(data),
skipinitialspace=True,
delim_whitespace=delim_whitespace,
)
return

result = parser.read_csv(
StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace
)
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_csv(
StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace
)
tm.assert_frame_equal(result, expected)


Expand Down Expand Up @@ -798,12 +819,20 @@ def test_read_table_delim_whitespace_default_sep(all_parsers):
f = StringIO("a b c\n1 -2 -3\n4 5 6")
parser = all_parsers

depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated"

if parser.engine == "pyarrow":
msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_table(f, delim_whitespace=True)
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_table(f, delim_whitespace=True)
return
result = parser.read_table(f, delim_whitespace=True)
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_table(f, delim_whitespace=True)
expected = DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]})
tm.assert_frame_equal(result, expected)

Expand All @@ -817,11 +846,15 @@ def test_read_csv_delim_whitespace_non_default_sep(all_parsers, delimiter):
"Specified a delimiter with both sep and "
"delim_whitespace=True; you can only specify one."
)
with pytest.raises(ValueError, match=msg):
parser.read_csv(f, delim_whitespace=True, sep=delimiter)
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
with pytest.raises(ValueError, match=msg):
parser.read_csv(f, delim_whitespace=True, sep=delimiter)

with pytest.raises(ValueError, match=msg):
parser.read_csv(f, delim_whitespace=True, delimiter=delimiter)
with pytest.raises(ValueError, match=msg):
parser.read_csv(f, delim_whitespace=True, delimiter=delimiter)


def test_read_csv_delimiter_and_sep_no_default(all_parsers):
Expand Down Expand Up @@ -858,11 +891,15 @@ def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter):
"Specified a delimiter with both sep and "
"delim_whitespace=True; you can only specify one."
)
with pytest.raises(ValueError, match=msg):
parser.read_table(f, delim_whitespace=True, sep=delimiter)
depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated"
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
with pytest.raises(ValueError, match=msg):
parser.read_table(f, delim_whitespace=True, sep=delimiter)

with pytest.raises(ValueError, match=msg):
parser.read_table(f, delim_whitespace=True, delimiter=delimiter)
with pytest.raises(ValueError, match=msg):
parser.read_table(f, delim_whitespace=True, delimiter=delimiter)


@skip_pyarrow
Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/io/parser/test_c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,11 @@ def test_delim_whitespace_custom_terminator(c_parser_only):
data = "a b c~1 2 3~4 5 6~7 8 9"
parser = c_parser_only

df = parser.read_csv(StringIO(data), lineterminator="~", delim_whitespace=True)
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
df = parser.read_csv(StringIO(data), lineterminator="~", delim_whitespace=True)
expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"])
tm.assert_frame_equal(df, expected)

Expand Down
17 changes: 14 additions & 3 deletions pandas/tests/io/parser/test_comment.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,12 @@ def test_line_comment(all_parsers, read_kwargs, request):
#ignore this line
5.,NaN,10.0
"""
warn = None
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"

if read_kwargs.get("delim_whitespace"):
data = data.replace(",", " ")
warn = FutureWarning
elif read_kwargs.get("lineterminator"):
data = data.replace("\n", read_kwargs.get("lineterminator"))

Expand All @@ -55,15 +59,22 @@ def test_line_comment(all_parsers, read_kwargs, request):
else:
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), **read_kwargs)
with tm.assert_produces_warning(
warn, match=depr_msg, check_stacklevel=False
):
parser.read_csv(StringIO(data), **read_kwargs)
return
elif parser.engine == "python" and read_kwargs.get("lineterminator"):
msg = r"Custom line terminators not supported in python parser \(yet\)"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), **read_kwargs)
with tm.assert_produces_warning(
warn, match=depr_msg, check_stacklevel=False
):
parser.read_csv(StringIO(data), **read_kwargs)
return

result = parser.read_csv(StringIO(data), **read_kwargs)
with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False):
result = parser.read_csv(StringIO(data), **read_kwargs)

expected = DataFrame(
[[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/io/parser/test_header.py
Original file line number Diff line number Diff line change
Expand Up @@ -706,7 +706,11 @@ def test_header_delim_whitespace(all_parsers):
3,4
"""

result = parser.read_csv(StringIO(data), delim_whitespace=True)
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_csv(StringIO(data), delim_whitespace=True)
expected = DataFrame({"a,b": ["1,2", "3,4"]})
tm.assert_frame_equal(result, expected)

Expand Down
10 changes: 8 additions & 2 deletions pandas/tests/io/parser/test_read_fwf.py
Original file line number Diff line number Diff line change
Expand Up @@ -602,7 +602,10 @@ def test_skiprows_inference():
101.6 956.1
""".strip()
skiprows = 2
expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True)

depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True)

result = read_fwf(StringIO(data), skiprows=skiprows)
tm.assert_frame_equal(result, expected)
Expand All @@ -617,7 +620,10 @@ def test_skiprows_by_index_inference():
456 78 9 456
""".strip()
skiprows = [0, 2]
expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True)

depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True)

result = read_fwf(StringIO(data), skiprows=skiprows)
tm.assert_frame_equal(result, expected)
Expand Down
17 changes: 11 additions & 6 deletions pandas/tests/io/parser/test_skiprows.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,12 +216,17 @@ def test_skiprows_lineterminator(all_parsers, lineterminator, request):
request.applymarker(mark)

data = data.replace("\n", lineterminator)
result = parser.read_csv(
StringIO(data),
skiprows=1,
delim_whitespace=True,
names=["date", "time", "var", "flag", "oflag"],
)

depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_csv(
StringIO(data),
skiprows=1,
delim_whitespace=True,
names=["date", "time", "var", "flag", "oflag"],
)
tm.assert_frame_equal(result, expected)


Expand Down
Loading