Skip to content

Commit

Permalink
CLN: Remove deprecated read_csv(delim_whitespace=) (pandas-dev#58668)
Browse files Browse the repository at this point in the history
* CLN: Remove deprecated read_csv(delim_whitespace=)

* Clarify notes

* Fix some arrow failures

* Update doc/source/whatsnew/v3.0.0.rst

Co-authored-by: Abdulaziz Aloqeely <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: Abdulaziz Aloqeely <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
3 people authored May 15, 2024
1 parent c1234db commit 0fc0336
Show file tree
Hide file tree
Showing 14 changed files with 50 additions and 284 deletions.
2 changes: 0 additions & 2 deletions doc/source/user_guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1511,7 +1511,6 @@ Currently, options unsupported by the C and pyarrow engines include:

* ``sep`` other than a single character (e.g. regex separators)
* ``skipfooter``
* ``sep=None`` with ``delim_whitespace=False``

Specifying any of the above options will produce a ``ParserWarning`` unless the
python engine is selected explicitly using ``engine='python'``.
Expand All @@ -1526,7 +1525,6 @@ Options that are unsupported by the pyarrow engine which are not covered by the
* ``memory_map``
* ``dialect``
* ``on_bad_lines``
* ``delim_whitespace``
* ``quoting``
* ``lineterminator``
* ``converters``
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,7 @@ Removal of prior version deprecations/changes
- Removed deprecated "method" and "limit" keywords from :meth:`Series.replace` and :meth:`DataFrame.replace` (:issue:`53492`)
- Removed extension test classes ``BaseNoReduceTests``, ``BaseNumericReduceTests``, ``BaseBooleanReduceTests`` (:issue:`54663`)
- Removed the "closed" and "normalize" keywords in :meth:`DatetimeIndex.__new__` (:issue:`52628`)
- Removed the deprecated ``delim_whitespace`` keyword in :func:`read_csv` and :func:`read_table`, use ``sep=r"\s+"`` instead (:issue:`55569`)
- Require :meth:`SparseDtype.fill_value` to be a valid value for the :meth:`SparseDtype.subtype` (:issue:`53043`)
- Stopped performing dtype inference when setting an :class:`Index` into a :class:`DataFrame` (:issue:`56102`)
- Stopped performing dtype inference in :meth:`Index.insert` with object-dtype index; this often affects the index/columns that result when setting new entries into an empty :class:`Series` or :class:`DataFrame` (:issue:`51363`)
Expand Down
1 change: 0 additions & 1 deletion pandas/errors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,6 @@ class ParserWarning(Warning):
1. `sep` other than a single character (e.g. regex separators)
2. `skipfooter` higher than 0
3. `sep=None` with `delim_whitespace=False`
The warning can be avoided by adding `engine='python'` as a parameter in
`pd.read_csv` and `pd.read_table` methods.
Expand Down
5 changes: 2 additions & 3 deletions pandas/io/clipboards.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,8 @@ def read_clipboard(
if index_length != 0:
kwargs.setdefault("index_col", list(range(index_length)))

# Edge case where sep is specified to be None, return to default
if sep is None and kwargs.get("delim_whitespace") is None:
sep = r"\s+"
elif not isinstance(sep, str):
raise ValueError(f"{sep=} must be a string")

# Regex separator currently only works with python engine.
# Default to python if separator is multi-character (regex)
Expand Down
73 changes: 2 additions & 71 deletions pandas/io/parsers/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,6 @@ class _read_shared(TypedDict, Generic[HashableT], total=False):
encoding_errors: str | None
dialect: str | csv.Dialect | None
on_bad_lines: str
delim_whitespace: bool | lib.NoDefault
low_memory: bool
memory_map: bool
float_precision: Literal["high", "legacy", "round_trip"] | None
Expand Down Expand Up @@ -425,14 +424,6 @@ class _read_shared(TypedDict, Generic[HashableT], total=False):
Callable for ``engine='pyarrow'``
delim_whitespace : bool, default False
Specifies whether or not whitespace (e.g. ``' '`` or ``'\\t'``) will be
used as the ``sep`` delimiter. Equivalent to setting ``sep='\\s+'``. If this option
is set to ``True``, nothing should be passed in for the ``delimiter``
parameter.
.. deprecated:: 2.2.0
Use ``sep="\\s+"`` instead.
low_memory : bool, default True
Internally process the file in chunks, resulting in lower memory use
while parsing, but possibly mixed type inference. To ensure no mixed
Expand Down Expand Up @@ -558,15 +549,13 @@ class _read_shared(TypedDict, Generic[HashableT], total=False):


class _C_Parser_Defaults(TypedDict):
delim_whitespace: Literal[False]
na_filter: Literal[True]
low_memory: Literal[True]
memory_map: Literal[False]
float_precision: None


_c_parser_defaults: _C_Parser_Defaults = {
"delim_whitespace": False,
"na_filter": True,
"low_memory": True,
"memory_map": False,
Expand All @@ -592,7 +581,6 @@ class _Fwf_Defaults(TypedDict):
"thousands",
"memory_map",
"dialect",
"delim_whitespace",
"quoting",
"lineterminator",
"converters",
Expand Down Expand Up @@ -818,24 +806,12 @@ def read_csv(
# Error Handling
on_bad_lines: str = "error",
# Internal
delim_whitespace: bool | lib.NoDefault = lib.no_default,
low_memory: bool = _c_parser_defaults["low_memory"],
memory_map: bool = False,
float_precision: Literal["high", "legacy", "round_trip"] | None = None,
storage_options: StorageOptions | None = None,
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame | TextFileReader:
if delim_whitespace is not lib.no_default:
# GH#55569
warnings.warn(
"The 'delim_whitespace' keyword in pd.read_csv is deprecated and "
"will be removed in a future version. Use ``sep='\\s+'`` instead",
FutureWarning,
stacklevel=find_stack_level(),
)
else:
delim_whitespace = False

# locals() should never be modified
kwds = locals().copy()
del kwds["filepath_or_buffer"]
Expand All @@ -844,7 +820,6 @@ def read_csv(
kwds_defaults = _refine_defaults_read(
dialect,
delimiter,
delim_whitespace,
engine,
sep,
on_bad_lines,
Expand Down Expand Up @@ -963,24 +938,12 @@ def read_table(
# Error Handling
on_bad_lines: str = "error",
# Internal
delim_whitespace: bool | lib.NoDefault = lib.no_default,
low_memory: bool = _c_parser_defaults["low_memory"],
memory_map: bool = False,
float_precision: Literal["high", "legacy", "round_trip"] | None = None,
storage_options: StorageOptions | None = None,
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame | TextFileReader:
if delim_whitespace is not lib.no_default:
# GH#55569
warnings.warn(
"The 'delim_whitespace' keyword in pd.read_table is deprecated and "
"will be removed in a future version. Use ``sep='\\s+'`` instead",
FutureWarning,
stacklevel=find_stack_level(),
)
else:
delim_whitespace = False

# locals() should never be modified
kwds = locals().copy()
del kwds["filepath_or_buffer"]
Expand All @@ -989,7 +952,6 @@ def read_table(
kwds_defaults = _refine_defaults_read(
dialect,
delimiter,
delim_whitespace,
engine,
sep,
on_bad_lines,
Expand Down Expand Up @@ -1296,17 +1258,10 @@ def _clean_options(
engine = "python"

sep = options["delimiter"]
delim_whitespace = options["delim_whitespace"]

if sep is None and not delim_whitespace:
if engine in ("c", "pyarrow"):
fallback_reason = (
f"the '{engine}' engine does not support "
"sep=None with delim_whitespace=False"
)
engine = "python"
elif sep is not None and len(sep) > 1:
if sep is not None and len(sep) > 1:
if engine == "c" and sep == r"\s+":
# delim_whitespace passed on to pandas._libs.parsers.TextReader
result["delim_whitespace"] = True
del result["delimiter"]
elif engine not in ("python", "python-fwf"):
Expand All @@ -1317,9 +1272,6 @@ def _clean_options(
r"different from '\s+' are interpreted as regex)"
)
engine = "python"
elif delim_whitespace:
if "python" in engine:
result["delimiter"] = r"\s+"
elif sep is not None:
encodeable = True
encoding = sys.getfilesystemencoding() or "utf-8"
Expand Down Expand Up @@ -1730,7 +1682,6 @@ def _stringify_na_values(na_values, floatify: bool) -> set[str | float]:
def _refine_defaults_read(
dialect: str | csv.Dialect | None,
delimiter: str | None | lib.NoDefault,
delim_whitespace: bool,
engine: CSVEngine | None,
sep: str | None | lib.NoDefault,
on_bad_lines: str | Callable,
Expand All @@ -1750,14 +1701,6 @@ def _refine_defaults_read(
documentation for more details.
delimiter : str or object
Alias for sep.
delim_whitespace : bool
Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
is set to True, nothing should be passed in for the ``delimiter``
parameter.
.. deprecated:: 2.2.0
Use ``sep="\\s+"`` instead.
engine : {{'c', 'python'}}
Parser engine to use. The C engine is faster while the python engine is
currently more feature-complete.
Expand All @@ -1777,12 +1720,6 @@ def _refine_defaults_read(
-------
kwds : dict
Input parameters with correct values.
Raises
------
ValueError :
If a delimiter was specified with ``sep`` (or ``delimiter``) and
``delim_whitespace=True``.
"""
# fix types for sep, delimiter to Union(str, Any)
delim_default = defaults["delimiter"]
Expand Down Expand Up @@ -1813,12 +1750,6 @@ def _refine_defaults_read(
if delimiter is None:
delimiter = sep

if delim_whitespace and (delimiter is not lib.no_default):
raise ValueError(
"Specified a delimiter with both sep and "
"delim_whitespace=True; you can only specify one."
)

if delimiter == "\n":
raise ValueError(
r"Specified \n as separator or delimiter. This forces the python engine "
Expand Down
Loading

0 comments on commit 0fc0336

Please sign in to comment.