diff --git a/doc/source/reference/style.rst b/doc/source/reference/style.rst index 3d155535e25857..24a47336b05225 100644 --- a/doc/source/reference/style.rst +++ b/doc/source/reference/style.rst @@ -41,6 +41,7 @@ Style application Styler.set_caption Styler.set_properties Styler.set_uuid + Styler.set_na_rep Styler.clear Styler.pipe diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index e0dc2e734e660b..5e026e3a7d78fa 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -67,6 +67,7 @@ "df = pd.DataFrame({'A': np.linspace(1, 10, 10)})\n", "df = pd.concat([df, pd.DataFrame(np.random.randn(10, 4), columns=list('BCDE'))],\n", " axis=1)\n", + "df.iloc[3, 3] = np.nan\n", "df.iloc[0, 2] = np.nan" ] }, @@ -402,6 +403,38 @@ "df.style.format({\"B\": lambda x: \"±{:.2f}\".format(abs(x))})" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can format the text displayed for missing values by `na_rep`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.style.format(\"{:.2%}\", na_rep=\"-\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "These formatting techniques can be used in combination with styling." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.style.highlight_max().format(None, na_rep=\"-\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -659,6 +692,7 @@ "- precision\n", "- captions\n", "- table-wide styles\n", + "- missing values representation\n", "- hiding the index or columns\n", "\n", "Each of these can be specified in two ways:\n", @@ -800,6 +834,32 @@ "We hope to collect some useful ones either in pandas, or preferable in a new package that [builds on top](#Extensibility) the tools here." 
] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Missing values" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can control the default missing values representation for the entire table through the `set_na_rep` method." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "(df.style\n", + " .set_na_rep(\"FAIL\")\n", + " .format(None, na_rep=\"PASS\", subset=[\"D\"])\n", + " .highlight_null(\"yellow\"))" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index f231c2b31abb13..3990eec2435d9e 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -122,6 +122,7 @@ Other enhancements - Added ``encoding`` argument to :meth:`DataFrame.to_string` for non-ascii text (:issue:`28766`) - Added ``encoding`` argument to :func:`DataFrame.to_html` for non-ascii text (:issue:`28663`) - :meth:`Styler.background_gradient` now accepts ``vmin`` and ``vmax`` arguments (:issue:`12145`) +- :meth:`Styler.format` added the ``na_rep`` parameter to help format the missing values (:issue:`21527`, :issue:`28358`) - Roundtripping DataFrames with nullable integer or string data types to parquet (:meth:`~DataFrame.to_parquet` / :func:`read_parquet`) using the `'pyarrow'` engine now preserve those data types with pyarrow >= 1.0.0 (:issue:`20612`).
diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 6fc4e21d33d16c..ebe86a7f535cb7 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -8,7 +8,7 @@ import copy from functools import partial from itertools import product -from typing import Optional +from typing import Any, Callable, DefaultDict, Dict, List, Optional, Sequence, Tuple from uuid import uuid1 import numpy as np @@ -71,6 +71,11 @@ class Styler: The ``id`` takes the form ``T__row_col`` where ```` is the unique identifier, ```` is the row number and ```` is the column number. + na_rep : str, optional + Representation for missing values. + If ``na_rep`` is None, no special formatting is applied + + .. versionadded:: 1.0.0 Attributes ---------- @@ -126,9 +131,10 @@ def __init__( caption=None, table_attributes=None, cell_ids=True, + na_rep: Optional[str] = None, ): - self.ctx = defaultdict(list) - self._todo = [] + self.ctx: DefaultDict[Tuple[int, int], List[str]] = defaultdict(list) + self._todo: List[Tuple[Callable, Tuple, Dict]] = [] if not isinstance(data, (pd.Series, pd.DataFrame)): raise TypeError("``data`` must be a Series or DataFrame") @@ -149,19 +155,24 @@ def __init__( self.precision = precision self.table_attributes = table_attributes self.hidden_index = False - self.hidden_columns = [] + self.hidden_columns: Sequence[int] = [] self.cell_ids = cell_ids + self.na_rep = na_rep # display_funcs maps (row, col) -> formatting function def default_display_func(x): - if is_float(x): + if self.na_rep is not None and pd.isna(x): + return self.na_rep + elif is_float(x): display_format = "{0:.{precision}f}".format(x, precision=self.precision) return display_format else: return x - self._display_funcs = defaultdict(lambda: default_display_func) + self._display_funcs: DefaultDict[ + Tuple[int, int], Callable[[Any], str] + ] = defaultdict(lambda: default_display_func) def _repr_html_(self): """ @@ -416,16 +427,22 @@ def format_attr(pair): 
table_attributes=table_attr, ) - def format(self, formatter, subset=None): + def format(self, formatter, subset=None, na_rep: Optional[str] = None): """ Format the text display value of cells. Parameters ---------- - formatter : str, callable, or dict + formatter : str, callable, dict or None + If ``formatter`` is None, the default formatter is used subset : IndexSlice An argument to ``DataFrame.loc`` that restricts which elements ``formatter`` is applied to. + na_rep : str, optional + Representation for missing values. + If ``na_rep`` is None, no special formatting is applied + + .. versionadded:: 1.0.0 Returns ------- @@ -451,6 +468,10 @@ def format(self, formatter, subset=None): >>> df['c'] = ['a', 'b', 'c', 'd'] >>> df.style.format({'c': str.upper}) """ + if formatter is None: + assert self._display_funcs.default_factory is not None + formatter = self._display_funcs.default_factory() + if subset is None: row_locs = range(len(self.data)) col_locs = range(len(self.data.columns)) @@ -466,16 +487,16 @@ def format(self, formatter, subset=None): if is_dict_like(formatter): for col, col_formatter in formatter.items(): # formatter must be callable, so '{}' are converted to lambdas - col_formatter = _maybe_wrap_formatter(col_formatter) + col_formatter = _maybe_wrap_formatter(col_formatter, na_rep) col_num = self.data.columns.get_indexer_for([col])[0] for row_num in row_locs: self._display_funcs[(row_num, col_num)] = col_formatter else: # single scalar to format all cells with + formatter = _maybe_wrap_formatter(formatter, na_rep) locs = product(*(row_locs, col_locs)) for i, j in locs: - formatter = _maybe_wrap_formatter(formatter) self._display_funcs[(i, j)] = formatter return self @@ -553,6 +574,7 @@ def _copy(self, deepcopy=False): caption=self.caption, uuid=self.uuid, table_styles=self.table_styles, + na_rep=self.na_rep, ) if deepcopy: styler.ctx = copy.deepcopy(self.ctx) @@ -896,6 +918,23 @@ def set_table_styles(self, table_styles): self.table_styles = table_styles 
def _maybe_wrap_formatter(formatter, na_rep: Optional[str]):
    """
    Coerce ``formatter`` into a callable and optionally make it NA-aware.

    Parameters
    ----------
    formatter : str or callable
        Either a template string, which is applied to the cell value with
        ``str.format``, or a callable that takes the cell value and returns
        the value to display.
    na_rep : str or None
        Text to display instead of a missing value. If None, ``formatter``
        is applied to every value, missing or not.

    Returns
    -------
    callable
        Function mapping a single cell value to its display value.

    Raises
    ------
    TypeError
        If ``formatter`` is neither a string nor a callable, or if ``na_rep``
        is neither None nor a string.
    """
    if isinstance(formatter, str):
        # The bound method formats a single positional value with the template.
        formatter_func = formatter.format
    elif callable(formatter):
        formatter_func = formatter
    else:
        msg = (
            "Expected a template string or callable, got {formatter} "
            "instead".format(formatter=formatter)
        )
        raise TypeError(msg)

    if na_rep is None:
        return formatter_func
    if not isinstance(na_rep, str):
        msg = "Expected a string, got {na_rep} instead".format(na_rep=na_rep)
        raise TypeError(msg)

    def na_aware_formatter(x):
        # ``pd.isna`` returns an array for list-like cell values; testing its
        # truthiness would raise (or misfire for one-element containers).
        # Only an exact scalar ``True`` means "this cell is missing".
        if pd.isna(x) is True:
            return na_rep
        return formatter_func(x)

    return na_aware_formatter
ctx["body"][1][2]["display_value"] == "120.00%" + + ctx = df.style.format("{:.2%}", na_rep="-", subset=["B"])._translate() + assert ctx["body"][0][2]["display_value"] == "-" + assert ctx["body"][1][2]["display_value"] == "120.00%" + + def test_init_with_na_rep(self): + # GH 21527 28358 + df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"]) + + ctx = Styler(df, na_rep="NA")._translate() + assert ctx["body"][0][1]["display_value"] == "NA" + assert ctx["body"][0][2]["display_value"] == "NA" + + def test_set_na_rep(self): + # GH 21527 28358 + df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"]) + + ctx = df.style.set_na_rep("NA")._translate() + assert ctx["body"][0][1]["display_value"] == "NA" + assert ctx["body"][0][2]["display_value"] == "NA" + + ctx = ( + df.style.set_na_rep("NA") + .format(None, na_rep="-", subset=["B"]) + ._translate() + ) + assert ctx["body"][0][1]["display_value"] == "NA" + assert ctx["body"][0][2]["display_value"] == "-" + + def test_format_non_numeric_na(self): + # GH 21527 28358 + df = pd.DataFrame( + { + "object": [None, np.nan, "foo"], + "datetime": [None, pd.NaT, pd.Timestamp("20120101")], + } + ) + + ctx = df.style.set_na_rep("NA")._translate() + assert ctx["body"][0][1]["display_value"] == "NA" + assert ctx["body"][0][2]["display_value"] == "NA" + assert ctx["body"][1][1]["display_value"] == "NA" + assert ctx["body"][1][2]["display_value"] == "NA" + + ctx = df.style.format(None, na_rep="-")._translate() + assert ctx["body"][0][1]["display_value"] == "-" + assert ctx["body"][0][2]["display_value"] == "-" + assert ctx["body"][1][1]["display_value"] == "-" + assert ctx["body"][1][2]["display_value"] == "-" + + def test_format_with_bad_na_rep(self): + # GH 21527 28358 + df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"]) + with pytest.raises(TypeError): + df.style.format(None, na_rep=-1) + def test_highlight_null(self, null_color="red"): df = pd.DataFrame({"A": [0, np.nan]}) result = 
df.style.highlight_null()._compute().ctx