Skip to content

Commit

Permalink
ENH: Add built-in function for Styler to format the text displayed fo…
Browse files Browse the repository at this point in the history
…r missing values (pandas-dev#29118)

* Add built-in function for Styler to format the text displayed for missing values

As described in GH pandas-dev#28358, users who want to control how NA values are printed
while applying styles to the output currently have to implement their own formatter
(so that the underlying data does not change and can still be used for styling).
  • Loading branch information
immaxchen authored and proost committed Dec 19, 2019
1 parent 99995d4 commit 49b664c
Show file tree
Hide file tree
Showing 5 changed files with 191 additions and 13 deletions.
1 change: 1 addition & 0 deletions doc/source/reference/style.rst
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ Style application
Styler.set_caption
Styler.set_properties
Styler.set_uuid
Styler.set_na_rep
Styler.clear
Styler.pipe

Expand Down
60 changes: 60 additions & 0 deletions doc/source/user_guide/style.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@
"df = pd.DataFrame({'A': np.linspace(1, 10, 10)})\n",
"df = pd.concat([df, pd.DataFrame(np.random.randn(10, 4), columns=list('BCDE'))],\n",
" axis=1)\n",
"df.iloc[3, 3] = np.nan\n",
"df.iloc[0, 2] = np.nan"
]
},
Expand Down Expand Up @@ -402,6 +403,38 @@
"df.style.format({\"B\": lambda x: \"±{:.2f}\".format(abs(x))})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can format the text displayed for missing values with the `na_rep` argument."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.style.format(\"{:.2%}\", na_rep=\"-\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"These formatting techniques can be used in combination with styling."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.style.highlight_max().format(None, na_rep=\"-\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -659,6 +692,7 @@
"- precision\n",
"- captions\n",
"- table-wide styles\n",
"- missing values representation\n",
"- hiding the index or columns\n",
"\n",
"Each of these can be specified in two ways:\n",
Expand Down Expand Up @@ -800,6 +834,32 @@
"We hope to collect some useful ones either in pandas, or preferably in a new package that [builds on top](#Extensibility) of the tools here."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Missing values"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can control the default missing-value representation for the entire table through the `set_na_rep` method."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"(df.style\n",
" .set_na_rep(\"FAIL\")\n",
" .format(None, na_rep=\"PASS\", subset=[\"D\"])\n",
" .highlight_null(\"yellow\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ Other enhancements
- Added ``encoding`` argument to :meth:`DataFrame.to_string` for non-ascii text (:issue:`28766`)
- Added ``encoding`` argument to :func:`DataFrame.to_html` for non-ascii text (:issue:`28663`)
- :meth:`Styler.background_gradient` now accepts ``vmin`` and ``vmax`` arguments (:issue:`12145`)
- Added ``na_rep`` argument to :meth:`Styler.format` to control how missing values are displayed (:issue:`21527`, :issue:`28358`)
- Roundtripping DataFrames with nullable integer or string data types to parquet
(:meth:`~DataFrame.to_parquet` / :func:`read_parquet`) using the `'pyarrow'` engine
now preserve those data types with pyarrow >= 1.0.0 (:issue:`20612`).
Expand Down
73 changes: 60 additions & 13 deletions pandas/io/formats/style.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import copy
from functools import partial
from itertools import product
from typing import Optional
from typing import Any, Callable, DefaultDict, Dict, List, Optional, Sequence, Tuple
from uuid import uuid1

import numpy as np
Expand Down Expand Up @@ -71,6 +71,11 @@ class Styler:
The ``id`` takes the form ``T_<uuid>_row<num_row>_col<num_col>``
where ``<uuid>`` is the unique identifier, ``<num_row>`` is the row
number and ``<num_col>`` is the column number.
na_rep : str, optional
Representation for missing values.
If ``na_rep`` is None, no special formatting is applied
.. versionadded:: 1.0.0
Attributes
----------
Expand Down Expand Up @@ -126,9 +131,10 @@ def __init__(
caption=None,
table_attributes=None,
cell_ids=True,
na_rep: Optional[str] = None,
):
self.ctx = defaultdict(list)
self._todo = []
self.ctx: DefaultDict[Tuple[int, int], List[str]] = defaultdict(list)
self._todo: List[Tuple[Callable, Tuple, Dict]] = []

if not isinstance(data, (pd.Series, pd.DataFrame)):
raise TypeError("``data`` must be a Series or DataFrame")
Expand All @@ -149,19 +155,24 @@ def __init__(
self.precision = precision
self.table_attributes = table_attributes
self.hidden_index = False
self.hidden_columns = []
self.hidden_columns: Sequence[int] = []
self.cell_ids = cell_ids
self.na_rep = na_rep

# display_funcs maps (row, col) -> formatting function

def default_display_func(x):
if is_float(x):
if self.na_rep is not None and pd.isna(x):
return self.na_rep
elif is_float(x):
display_format = "{0:.{precision}f}".format(x, precision=self.precision)
return display_format
else:
return x

self._display_funcs = defaultdict(lambda: default_display_func)
self._display_funcs: DefaultDict[
Tuple[int, int], Callable[[Any], str]
] = defaultdict(lambda: default_display_func)

def _repr_html_(self):
"""
Expand Down Expand Up @@ -416,16 +427,22 @@ def format_attr(pair):
table_attributes=table_attr,
)

def format(self, formatter, subset=None):
def format(self, formatter, subset=None, na_rep: Optional[str] = None):
"""
Format the text display value of cells.
Parameters
----------
formatter : str, callable, or dict
formatter : str, callable, dict or None
If ``formatter`` is None, the default formatter is used
subset : IndexSlice
An argument to ``DataFrame.loc`` that restricts which elements
``formatter`` is applied to.
na_rep : str, optional
Representation for missing values.
If ``na_rep`` is None, no special formatting is applied
.. versionadded:: 1.0.0
Returns
-------
Expand All @@ -451,6 +468,10 @@ def format(self, formatter, subset=None):
>>> df['c'] = ['a', 'b', 'c', 'd']
>>> df.style.format({'c': str.upper})
"""
if formatter is None:
assert self._display_funcs.default_factory is not None
formatter = self._display_funcs.default_factory()

if subset is None:
row_locs = range(len(self.data))
col_locs = range(len(self.data.columns))
Expand All @@ -466,16 +487,16 @@ def format(self, formatter, subset=None):
if is_dict_like(formatter):
for col, col_formatter in formatter.items():
# formatter must be callable, so '{}' are converted to lambdas
col_formatter = _maybe_wrap_formatter(col_formatter)
col_formatter = _maybe_wrap_formatter(col_formatter, na_rep)
col_num = self.data.columns.get_indexer_for([col])[0]

for row_num in row_locs:
self._display_funcs[(row_num, col_num)] = col_formatter
else:
# single scalar to format all cells with
formatter = _maybe_wrap_formatter(formatter, na_rep)
locs = product(*(row_locs, col_locs))
for i, j in locs:
formatter = _maybe_wrap_formatter(formatter)
self._display_funcs[(i, j)] = formatter
return self

Expand Down Expand Up @@ -553,6 +574,7 @@ def _copy(self, deepcopy=False):
caption=self.caption,
uuid=self.uuid,
table_styles=self.table_styles,
na_rep=self.na_rep,
)
if deepcopy:
styler.ctx = copy.deepcopy(self.ctx)
Expand Down Expand Up @@ -896,6 +918,23 @@ def set_table_styles(self, table_styles):
self.table_styles = table_styles
return self

def set_na_rep(self, na_rep: str) -> "Styler":
    """
    Set the missing data representation on a Styler.

    .. versionadded:: 1.0.0

    Parameters
    ----------
    na_rep : str
        Text to store as this Styler's missing-value representation.

    Returns
    -------
    self : Styler
        The same object, so calls can be chained.
    """
    # Only the attribute is updated here; nothing is rendered or validated.
    self.na_rep = na_rep
    return self

def hide_index(self):
"""
Hide any indices from rendering.
Expand Down Expand Up @@ -1487,14 +1526,22 @@ def _get_level_lengths(index, hidden_elements=None):
return non_zero_lengths


def _maybe_wrap_formatter(formatter):
def _maybe_wrap_formatter(formatter, na_rep: Optional[str]):
if isinstance(formatter, str):
return lambda x: formatter.format(x)
formatter_func = lambda x: formatter.format(x)
elif callable(formatter):
return formatter
formatter_func = formatter
else:
msg = (
"Expected a template string or callable, got {formatter} "
"instead".format(formatter=formatter)
)
raise TypeError(msg)

if na_rep is None:
return formatter_func
elif isinstance(na_rep, str):
return lambda x: na_rep if pd.isna(x) else formatter_func(x)
else:
msg = "Expected a string, got {na_rep} instead".format(na_rep=na_rep)
raise TypeError(msg)
69 changes: 69 additions & 0 deletions pandas/tests/io/formats/test_style.py
Original file line number Diff line number Diff line change
Expand Up @@ -1009,6 +1009,75 @@ def test_bar_bad_align_raises(self):
with pytest.raises(ValueError):
df.style.bar(align="poorly", color=["#d65f5f", "#5fba7d"])

def test_format_with_na_rep(self):
    # GH 21527 28358
    frame = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"])

    # Default formatter: only the missing-value text is customised.
    body = frame.style.format(None, na_rep="-")._translate()["body"]
    for col in (1, 2):
        assert body[0][col]["display_value"] == "-"

    # Template formatter: missing cells get ``na_rep``, the rest the template.
    body = frame.style.format("{:.2%}", na_rep="-")._translate()["body"]
    expected = [
        (0, 1, "-"),
        (0, 2, "-"),
        (1, 1, "110.00%"),
        (1, 2, "120.00%"),
    ]
    for row, col, text in expected:
        assert body[row][col]["display_value"] == text

    # Same template, restricted to column "B" through ``subset``.
    styler = frame.style.format("{:.2%}", na_rep="-", subset=["B"])
    body = styler._translate()["body"]
    assert body[0][2]["display_value"] == "-"
    assert body[1][2]["display_value"] == "120.00%"

def test_init_with_na_rep(self):
    # GH 21527 28358
    frame = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"])

    # ``na_rep`` given to the constructor applies to every missing cell.
    first_row = Styler(frame, na_rep="NA")._translate()["body"][0]
    for col in (1, 2):
        assert first_row[col]["display_value"] == "NA"

def test_set_na_rep(self):
    # GH 21527 28358
    frame = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"])

    # Table-wide representation set through ``set_na_rep``.
    first_row = frame.style.set_na_rep("NA")._translate()["body"][0]
    assert first_row[1]["display_value"] == "NA"
    assert first_row[2]["display_value"] == "NA"

    # A ``na_rep`` passed to ``format`` for column "B" wins for that column.
    styler = frame.style.set_na_rep("NA").format(None, na_rep="-", subset=["B"])
    first_row = styler._translate()["body"][0]
    assert first_row[1]["display_value"] == "NA"
    assert first_row[2]["display_value"] == "-"

def test_format_non_numeric_na(self):
    # GH 21527 28358
    columns = {
        "object": [None, np.nan, "foo"],
        "datetime": [None, pd.NaT, pd.Timestamp("20120101")],
    }
    frame = pd.DataFrame(columns)

    # The first two rows hold missing values in both columns.
    body = frame.style.set_na_rep("NA")._translate()["body"]
    for row in (0, 1):
        for col in (1, 2):
            assert body[row][col]["display_value"] == "NA"

    body = frame.style.format(None, na_rep="-")._translate()["body"]
    for row in (0, 1):
        for col in (1, 2):
            assert body[row][col]["display_value"] == "-"

def test_format_with_bad_na_rep(self):
    # GH 21527 28358
    rows = [[None, None], [1.1, 1.2]]
    frame = pd.DataFrame(rows, columns=["A", "B"])

    # A non-string ``na_rep`` must be rejected.
    with pytest.raises(TypeError):
        frame.style.format(None, na_rep=-1)

def test_highlight_null(self, null_color="red"):
df = pd.DataFrame({"A": [0, np.nan]})
result = df.style.highlight_null()._compute().ctx
Expand Down

0 comments on commit 49b664c

Please sign in to comment.