Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CLN: Remove deprecated read_csv(delim_whitespace=) #58668

Merged
merged 5 commits into from
May 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions doc/source/user_guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1511,7 +1511,6 @@ Currently, options unsupported by the C and pyarrow engines include:

* ``sep`` other than a single character (e.g. regex separators)
* ``skipfooter``
* ``sep=None`` with ``delim_whitespace=False``

Specifying any of the above options will produce a ``ParserWarning`` unless the
python engine is selected explicitly using ``engine='python'``.
Expand All @@ -1526,7 +1525,6 @@ Options that are unsupported by the pyarrow engine which are not covered by the
* ``memory_map``
* ``dialect``
* ``on_bad_lines``
* ``delim_whitespace``
* ``quoting``
* ``lineterminator``
* ``converters``
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,7 @@ Removal of prior version deprecations/changes
- Removed deprecated "method" and "limit" keywords from :meth:`Series.replace` and :meth:`DataFrame.replace` (:issue:`53492`)
- Removed extension test classes ``BaseNoReduceTests``, ``BaseNumericReduceTests``, ``BaseBooleanReduceTests`` (:issue:`54663`)
- Removed the "closed" and "normalize" keywords in :meth:`DatetimeIndex.__new__` (:issue:`52628`)
- Removed the deprecated ``delim_whitespace`` keyword in :func:`read_csv` and :func:`read_table`; use ``sep=r"\s+"`` instead (:issue:`55569`)
- Require :meth:`SparseDtype.fill_value` to be a valid value for the :meth:`SparseDtype.subtype` (:issue:`53043`)
- Stopped performing dtype inference in :meth:`Index.insert` with object-dtype index; this often affects the index/columns that result when setting new entries into an empty :class:`Series` or :class:`DataFrame` (:issue:`51363`)
- Removed the "closed" and "unit" keywords in :meth:`TimedeltaIndex.__new__` (:issue:`52628`, :issue:`55499`)
Expand Down
1 change: 0 additions & 1 deletion pandas/errors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,6 @@ class ParserWarning(Warning):

1. `sep` other than a single character (e.g. regex separators)
2. `skipfooter` higher than 0
3. `sep=None` with `delim_whitespace=False`

The warning can be avoided by adding `engine='python'` as a parameter in
`pd.read_csv` and `pd.read_table` methods.
Expand Down
5 changes: 2 additions & 3 deletions pandas/io/clipboards.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,8 @@ def read_clipboard(
if index_length != 0:
kwargs.setdefault("index_col", list(range(index_length)))

# Edge case where sep is specified to be None, return to default
if sep is None and kwargs.get("delim_whitespace") is None:
sep = r"\s+"
elif not isinstance(sep, str):
raise ValueError(f"{sep=} must be a string")

# Regex separator currently only works with python engine.
# Default to python if separator is multi-character (regex)
Expand Down
73 changes: 2 additions & 71 deletions pandas/io/parsers/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,6 @@ class _read_shared(TypedDict, Generic[HashableT], total=False):
encoding_errors: str | None
dialect: str | csv.Dialect | None
on_bad_lines: str
delim_whitespace: bool | lib.NoDefault
low_memory: bool
memory_map: bool
float_precision: Literal["high", "legacy", "round_trip"] | None
Expand Down Expand Up @@ -425,14 +424,6 @@ class _read_shared(TypedDict, Generic[HashableT], total=False):

Callable for ``engine='pyarrow'``

delim_whitespace : bool, default False
Specifies whether or not whitespace (e.g. ``' '`` or ``'\\t'``) will be
used as the ``sep`` delimiter. Equivalent to setting ``sep='\\s+'``. If this option
is set to ``True``, nothing should be passed in for the ``delimiter``
parameter.

.. deprecated:: 2.2.0
Use ``sep="\\s+"`` instead.
low_memory : bool, default True
Internally process the file in chunks, resulting in lower memory use
while parsing, but possibly mixed type inference. To ensure no mixed
Expand Down Expand Up @@ -558,15 +549,13 @@ class _read_shared(TypedDict, Generic[HashableT], total=False):


class _C_Parser_Defaults(TypedDict):
delim_whitespace: Literal[False]
na_filter: Literal[True]
low_memory: Literal[True]
memory_map: Literal[False]
float_precision: None


_c_parser_defaults: _C_Parser_Defaults = {
"delim_whitespace": False,
"na_filter": True,
"low_memory": True,
"memory_map": False,
Expand All @@ -592,7 +581,6 @@ class _Fwf_Defaults(TypedDict):
"thousands",
"memory_map",
"dialect",
"delim_whitespace",
"quoting",
"lineterminator",
"converters",
Expand Down Expand Up @@ -818,24 +806,12 @@ def read_csv(
# Error Handling
on_bad_lines: str = "error",
# Internal
delim_whitespace: bool | lib.NoDefault = lib.no_default,
low_memory: bool = _c_parser_defaults["low_memory"],
memory_map: bool = False,
float_precision: Literal["high", "legacy", "round_trip"] | None = None,
storage_options: StorageOptions | None = None,
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame | TextFileReader:
if delim_whitespace is not lib.no_default:
# GH#55569
warnings.warn(
"The 'delim_whitespace' keyword in pd.read_csv is deprecated and "
"will be removed in a future version. Use ``sep='\\s+'`` instead",
FutureWarning,
stacklevel=find_stack_level(),
)
else:
delim_whitespace = False

# locals() should never be modified
kwds = locals().copy()
del kwds["filepath_or_buffer"]
Expand All @@ -844,7 +820,6 @@ def read_csv(
kwds_defaults = _refine_defaults_read(
dialect,
delimiter,
delim_whitespace,
engine,
sep,
on_bad_lines,
Expand Down Expand Up @@ -963,24 +938,12 @@ def read_table(
# Error Handling
on_bad_lines: str = "error",
# Internal
delim_whitespace: bool | lib.NoDefault = lib.no_default,
low_memory: bool = _c_parser_defaults["low_memory"],
memory_map: bool = False,
float_precision: Literal["high", "legacy", "round_trip"] | None = None,
storage_options: StorageOptions | None = None,
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame | TextFileReader:
if delim_whitespace is not lib.no_default:
# GH#55569
warnings.warn(
"The 'delim_whitespace' keyword in pd.read_table is deprecated and "
"will be removed in a future version. Use ``sep='\\s+'`` instead",
FutureWarning,
stacklevel=find_stack_level(),
)
else:
delim_whitespace = False

# locals() should never be modified
kwds = locals().copy()
del kwds["filepath_or_buffer"]
Expand All @@ -989,7 +952,6 @@ def read_table(
kwds_defaults = _refine_defaults_read(
dialect,
delimiter,
delim_whitespace,
engine,
sep,
on_bad_lines,
Expand Down Expand Up @@ -1296,17 +1258,10 @@ def _clean_options(
engine = "python"

sep = options["delimiter"]
delim_whitespace = options["delim_whitespace"]

if sep is None and not delim_whitespace:
if engine in ("c", "pyarrow"):
fallback_reason = (
f"the '{engine}' engine does not support "
"sep=None with delim_whitespace=False"
)
engine = "python"
elif sep is not None and len(sep) > 1:
if sep is not None and len(sep) > 1:
if engine == "c" and sep == r"\s+":
# delim_whitespace passed on to pandas._libs.parsers.TextReader
result["delim_whitespace"] = True
del result["delimiter"]
elif engine not in ("python", "python-fwf"):
Expand All @@ -1317,9 +1272,6 @@ def _clean_options(
r"different from '\s+' are interpreted as regex)"
)
engine = "python"
elif delim_whitespace:
if "python" in engine:
result["delimiter"] = r"\s+"
elif sep is not None:
encodeable = True
encoding = sys.getfilesystemencoding() or "utf-8"
Expand Down Expand Up @@ -1730,7 +1682,6 @@ def _stringify_na_values(na_values, floatify: bool) -> set[str | float]:
def _refine_defaults_read(
dialect: str | csv.Dialect | None,
delimiter: str | None | lib.NoDefault,
delim_whitespace: bool,
engine: CSVEngine | None,
sep: str | None | lib.NoDefault,
on_bad_lines: str | Callable,
Expand All @@ -1750,14 +1701,6 @@ def _refine_defaults_read(
documentation for more details.
delimiter : str or object
Alias for sep.
delim_whitespace : bool
Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
is set to True, nothing should be passed in for the ``delimiter``
parameter.

.. deprecated:: 2.2.0
Use ``sep="\\s+"`` instead.
engine : {{'c', 'python'}}
Parser engine to use. The C engine is faster while the python engine is
currently more feature-complete.
Expand All @@ -1777,12 +1720,6 @@ def _refine_defaults_read(
-------
kwds : dict
Input parameters with correct values.

Raises
------
ValueError :
If a delimiter was specified with ``sep`` (or ``delimiter``) and
``delim_whitespace=True``.
"""
# fix types for sep, delimiter to Union(str, Any)
delim_default = defaults["delimiter"]
Expand Down Expand Up @@ -1813,12 +1750,6 @@ def _refine_defaults_read(
if delimiter is None:
delimiter = sep

if delim_whitespace and (delimiter is not lib.no_default):
raise ValueError(
"Specified a delimiter with both sep and "
"delim_whitespace=True; you can only specify one."
)

if delimiter == "\n":
raise ValueError(
r"Specified \n as separator or delimiter. This forces the python engine "
Expand Down
Loading