Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CLN: Enforce the deprecation of delim_whitespace kwd in read_csv #58280

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 1 addition & 43 deletions pandas/io/parsers/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,6 @@ class _read_shared(TypedDict, Generic[HashableT], total=False):
encoding_errors: str | None
dialect: str | csv.Dialect | None
on_bad_lines: str
delim_whitespace: bool | lib.NoDefault
low_memory: bool
memory_map: bool
float_precision: Literal["high", "legacy", "round_trip"] | None
Expand Down Expand Up @@ -517,15 +516,13 @@ class _read_shared(TypedDict, Generic[HashableT], total=False):


class _C_Parser_Defaults(TypedDict):
delim_whitespace: Literal[False]
na_filter: Literal[True]
low_memory: Literal[True]
memory_map: Literal[False]
float_precision: None


_c_parser_defaults: _C_Parser_Defaults = {
"delim_whitespace": False,
"na_filter": True,
"low_memory": True,
"memory_map": False,
Expand All @@ -551,7 +548,6 @@ class _Fwf_Defaults(TypedDict):
"thousands",
"memory_map",
"dialect",
"delim_whitespace",
"quoting",
"lineterminator",
"converters",
Expand Down Expand Up @@ -783,7 +779,6 @@ def read_csv(
# Error Handling
on_bad_lines: str = "error",
# Internal
delim_whitespace: bool | lib.NoDefault = lib.no_default,
low_memory: bool = _c_parser_defaults["low_memory"],
memory_map: bool = False,
float_precision: Literal["high", "legacy", "round_trip"] | None = None,
Expand Down Expand Up @@ -833,17 +828,6 @@ def read_csv(
stacklevel=find_stack_level(),
)

if delim_whitespace is not lib.no_default:
# GH#55569
warnings.warn(
"The 'delim_whitespace' keyword in pd.read_csv is deprecated and "
"will be removed in a future version. Use ``sep='\\s+'`` instead",
FutureWarning,
stacklevel=find_stack_level(),
)
else:
delim_whitespace = False

# locals() should never be modified
kwds = locals().copy()
del kwds["filepath_or_buffer"]
Expand All @@ -852,7 +836,6 @@ def read_csv(
kwds_defaults = _refine_defaults_read(
dialect,
delimiter,
delim_whitespace,
engine,
sep,
on_bad_lines,
Expand Down Expand Up @@ -974,7 +957,6 @@ def read_table(
# Error Handling
on_bad_lines: str = "error",
# Internal
delim_whitespace: bool | lib.NoDefault = lib.no_default,
low_memory: bool = _c_parser_defaults["low_memory"],
memory_map: bool = False,
float_precision: Literal["high", "legacy", "round_trip"] | None = None,
Expand Down Expand Up @@ -1015,17 +997,6 @@ def read_table(
stacklevel=find_stack_level(),
)

if delim_whitespace is not lib.no_default:
# GH#55569
warnings.warn(
"The 'delim_whitespace' keyword in pd.read_table is deprecated and "
"will be removed in a future version. Use ``sep='\\s+'`` instead",
FutureWarning,
stacklevel=find_stack_level(),
)
else:
delim_whitespace = False

# locals() should never be modified
kwds = locals().copy()
del kwds["filepath_or_buffer"]
Expand All @@ -1034,7 +1005,6 @@ def read_table(
kwds_defaults = _refine_defaults_read(
dialect,
delimiter,
delim_whitespace,
engine,
sep,
on_bad_lines,
Expand Down Expand Up @@ -1341,9 +1311,8 @@ def _clean_options(
engine = "python"

sep = options["delimiter"]
delim_whitespace = options["delim_whitespace"]

if sep is None and not delim_whitespace:
if sep is None:
if engine in ("c", "pyarrow"):
fallback_reason = (
f"the '{engine}' engine does not support "
Expand All @@ -1352,7 +1321,6 @@ def _clean_options(
engine = "python"
elif sep is not None and len(sep) > 1:
if engine == "c" and sep == r"\s+":
result["delim_whitespace"] = True
del result["delimiter"]
elif engine not in ("python", "python-fwf"):
# wait until regex engine integrated
Expand All @@ -1362,9 +1330,6 @@ def _clean_options(
r"different from '\s+' are interpreted as regex)"
)
engine = "python"
elif delim_whitespace:
if "python" in engine:
result["delimiter"] = r"\s+"
elif sep is not None:
encodeable = True
encoding = sys.getfilesystemencoding() or "utf-8"
Expand Down Expand Up @@ -1779,7 +1744,6 @@ def _stringify_na_values(na_values, floatify: bool) -> set[str | float]:
def _refine_defaults_read(
dialect: str | csv.Dialect | None,
delimiter: str | None | lib.NoDefault,
delim_whitespace: bool,
engine: CSVEngine | None,
sep: str | None | lib.NoDefault,
on_bad_lines: str | Callable,
Expand Down Expand Up @@ -1862,12 +1826,6 @@ def _refine_defaults_read(
if delimiter is None:
delimiter = sep

if delim_whitespace and (delimiter is not lib.no_default):
raise ValueError(
"Specified a delimiter with both sep and "
"delim_whitespace=True; you can only specify one."
)

if delimiter == "\n":
raise ValueError(
r"Specified \n as separator or delimiter. This forces the python engine "
Expand Down
35 changes: 6 additions & 29 deletions pandas/tests/io/parser/common/test_common_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,7 +480,7 @@ def test_read_empty_with_usecols(all_parsers, data, kwargs, expected):
(
{
"header": None,
"delim_whitespace": True,
"sep": r"\s+",
"skiprows": [0, 1, 2, 3, 5, 6],
"skip_blank_lines": True,
},
Expand All @@ -489,7 +489,7 @@ def test_read_empty_with_usecols(all_parsers, data, kwargs, expected):
# gh-8983: test skipping set of rows after a row with trailing spaces.
(
{
"delim_whitespace": True,
"sep": r"\s+",
"skiprows": [1, 2, 3, 5, 6],
"skip_blank_lines": True,
},
Expand All @@ -501,21 +501,10 @@ def test_trailing_spaces(all_parsers, kwargs, expected):
data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" # noqa: E501
parser = all_parsers

depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"

if parser.engine == "pyarrow":
msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_csv(StringIO(data.replace(",", " ")), **kwargs)
return
parser.read_csv(StringIO(data.replace(",", " ")), **kwargs)

with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs)
result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs)
tm.assert_frame_equal(result, expected)


Expand Down Expand Up @@ -815,25 +804,13 @@ def test_read_csv_names_not_accepting_sets(all_parsers):
parser.read_csv(StringIO(data), names=set("QAZ"))


@xfail_pyarrow
def test_read_table_delim_whitespace_default_sep(all_parsers):
# GH: 35958
f = StringIO("a b c\n1 -2 -3\n4 5 6")
parser = all_parsers

depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated"

if parser.engine == "pyarrow":
msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_table(f, delim_whitespace=True)
return
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_table(f, delim_whitespace=True)
result = parser.read_table(f, sep=r"\s+")
expected = DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]})
tm.assert_frame_equal(result, expected)

Expand Down
8 changes: 1 addition & 7 deletions pandas/tests/io/parser/test_header.py
Original file line number Diff line number Diff line change
Expand Up @@ -682,7 +682,6 @@ def test_header_missing_rows(all_parsers):
parser.read_csv(StringIO(data), header=[0, 1, 2])


# ValueError: The 'delim_whitespace' option is not supported with the 'pyarrow' engine
@xfail_pyarrow
def test_header_multiple_whitespaces(all_parsers):
# GH#54931
Expand All @@ -695,7 +694,6 @@ def test_header_multiple_whitespaces(all_parsers):
tm.assert_frame_equal(result, expected)


# ValueError: The 'delim_whitespace' option is not supported with the 'pyarrow' engine
@xfail_pyarrow
def test_header_delim_whitespace(all_parsers):
# GH#54918
Expand All @@ -705,11 +703,7 @@ def test_header_delim_whitespace(all_parsers):
3,4
"""

depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_csv(StringIO(data), delim_whitespace=True)
result = parser.read_csv(StringIO(data), sep=r"\s+")
expected = DataFrame({"a,b": ["1,2", "3,4"]})
tm.assert_frame_equal(result, expected)

Expand Down
8 changes: 2 additions & 6 deletions pandas/tests/io/parser/test_read_fwf.py
Original file line number Diff line number Diff line change
Expand Up @@ -593,9 +593,7 @@ def test_skiprows_inference():
""".strip()
skiprows = 2

depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True)
expected = read_csv(StringIO(data), skiprows=skiprows, sep=r"\s+")

result = read_fwf(StringIO(data), skiprows=skiprows)
tm.assert_frame_equal(result, expected)
Expand All @@ -611,9 +609,7 @@ def test_skiprows_by_index_inference():
""".strip()
skiprows = [0, 2]

depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True)
expected = read_csv(StringIO(data), skiprows=skiprows, sep=r"\s+")

result = read_fwf(StringIO(data), skiprows=skiprows)
tm.assert_frame_equal(result, expected)
Expand Down
21 changes: 2 additions & 19 deletions pandas/tests/io/parser/usecols/test_usecols_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,29 +254,12 @@ def test_usecols_regex_sep(all_parsers):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_usecols_with_whitespace(all_parsers):
parser = all_parsers
data = "a b c\n4 apple bat 5.7\n8 orange cow 10"

depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"

if parser.engine == "pyarrow":
msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_csv(
StringIO(data), delim_whitespace=True, usecols=("a", "b")
)
return

with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_csv(
StringIO(data), delim_whitespace=True, usecols=("a", "b")
)
result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b"))
expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8])
tm.assert_frame_equal(result, expected)

Expand Down
Loading