Skip to content

Commit

Permalink
Merge pull request #333 from pandas-profiling/rejected
Browse files Browse the repository at this point in the history
Rejected and constant variable support
  • Loading branch information
sbrugman authored Jan 14, 2020
2 parents 27f4700 + ae5166b commit 609c029
Show file tree
Hide file tree
Showing 11 changed files with 130 additions and 38 deletions.
7 changes: 7 additions & 0 deletions src/pandas_profiling/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,13 @@ def get_description(self) -> dict:
"""
return self.description_set

def get_rejected_variables(self) -> list:
    """Return the names of the variables that were rejected.

    A variable is reported here when the description set holds a message
    of type ``MessageType.REJECTED`` for it.

    Returns:
        The ``column_name`` of every rejected message, in message order.
    """
    # `self` was missing from the signature, so the method could not be
    # called on an instance even though the body reads `self.description_set`.
    return [
        message.column_name
        for message in self.description_set["messages"]
        if message.message_type == MessageType.REJECTED
    ]

def to_file(self, output_file: Path, silent: bool = True) -> None:
"""Write the report to a file.
Expand Down
9 changes: 9 additions & 0 deletions src/pandas_profiling/config_dark.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,19 @@ n_obs_unique: 5
n_extreme_obs: 5
n_freq_table_max: 10

# Configuration related to the samples area
samples:
head: 10
tail: 10

# Configuration related to the warning overview (top) and per variable warnings
warnings:
# Set to zero to disable showing warnings
collapse_if_more: 20

# Configuration related to the rejection of variables
reject_variables: True

# When in a Jupyter notebook
notebook:
iframe:
Expand Down
9 changes: 9 additions & 0 deletions src/pandas_profiling/config_default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,19 @@ n_obs_unique: 5
n_extreme_obs: 5
n_freq_table_max: 10

# Configuration related to the samples area
samples:
head: 10
tail: 10

# Configuration related to the warning overview (top) and per variable warnings
warnings:
# Set to zero to disable showing warnings
collapse_if_more: 20

# Configuration related to the rejection of variables
reject_variables: True

# When in a Jupyter notebook
notebook:
iframe:
Expand Down
9 changes: 9 additions & 0 deletions src/pandas_profiling/config_minimal.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,19 @@ n_obs_unique: 5
n_extreme_obs: 5
n_freq_table_max: 10

# Configuration related to the samples area
samples:
head: 10
tail: 10

# Configuration related to the warning overview (top) and per variable warnings
warnings:
# Set to zero to disable showing warnings
collapse_if_more: 20

# Configuration related to the rejection of variables
reject_variables: True

# When in a Jupyter notebook
notebook:
iframe:
Expand Down
12 changes: 0 additions & 12 deletions src/pandas_profiling/model/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,21 +33,9 @@ class Variable(Enum):

TYPE_COMPLEX = "COMPLEX"

S_TYPE_CONST = "CONST"
"""A constant variable"""

S_TYPE_UNIQUE = "UNIQUE"
"""An unique variable"""

S_TYPE_UNSUPPORTED = "UNSUPPORTED"
"""An unsupported variable"""

S_TYPE_CORR = "CORR"
"""A highly correlated variable"""

S_TYPE_REJECTED = "REJECTED"
"""A rejected variable"""


# Temporary mapping
Boolean = Variable.TYPE_BOOL
Expand Down
56 changes: 53 additions & 3 deletions src/pandas_profiling/model/messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
class MessageType(Enum):
"""Message Types"""

CONST = 1
CONSTANT = 1
"""This variable has a constant value."""

ZEROS = 2
Expand Down Expand Up @@ -52,6 +52,13 @@ class MessageType(Enum):
"""This variable is likely a datetime, but treated as categorical."""

UNIQUE = 12
"""This variable has unique values."""

CONSTANT_LENGTH = 13
"""This variable has a constant length"""

REJECTED = 15
"""Variables are rejected if we do not want to consider them for further analysis."""

UNIFORM = 14
"""The variable is uniformly distributed"""
Expand All @@ -76,6 +83,15 @@ def __init__(
self.column_name = column_name
self.anchor_id = hash(column_name)

def fmt(self):
    """Return the display label for this message.

    The label is the message type name with underscores replaced by spaces.
    A ``HIGH_CORRELATION`` message is wrapped in an ``<abbr>`` element whose
    tooltip lists the fields found in ``self.values["fields"]``.

    Returns:
        The label as a string; for high-correlation messages, an HTML snippet.
    """
    # Local import so this method does not depend on a module-level import.
    from html import escape

    # TODO: render in template
    name = self.message_type.name.replace("_", " ")
    if name == "HIGH CORRELATION":
        fields = self.values["fields"]
        # Field names originate from the data: coerce to str (names may be
        # non-string) and escape them so quotes or angle brackets cannot
        # break out of the title attribute.
        title = escape(", ".join(str(field) for field in fields), quote=True)
        name = '<abbr title="This variable has a high correlation with {num} fields: {title}">HIGH CORRELATION</abbr>'.format(
            num=len(fields), title=title
        )
    return name

def __repr__(self):
return "[{message_type}] warning on column {column}".format(
message_type=self.message_type.name, column=self.column_name
Expand Down Expand Up @@ -141,12 +157,21 @@ def check_variable_messages(col: str, description: dict) -> List[Message]:
messages.append(
Message(
column_name=col,
message_type=MessageType.CONST,
message_type=MessageType.CONSTANT,
values=description,
fields={"n_unique"},
)
)

messages.append(
Message(
column_name=col,
message_type=MessageType.REJECTED,
values=description,
fields={},
)
)

if description["distinct_count_without_nan"] == description["n"]:
messages.append(
Message(
Expand All @@ -168,8 +193,19 @@ def check_variable_messages(col: str, description: dict) -> List[Message]:
)
)

# Unsupported
if description["type"] == Variable.S_TYPE_UNSUPPORTED:
messages.append(
Message(
column_name=col,
message_type=MessageType.REJECTED,
values=description,
fields={},
)
)

# Categorical
if description["type"] in {Variable.TYPE_CAT}:
if description["type"] == Variable.TYPE_CAT:
if description["date_warning"]:
messages.append(
Message(column_name=col, message_type=MessageType.TYPE_DATE, values={})
Expand Down Expand Up @@ -197,6 +233,20 @@ def check_variable_messages(col: str, description: dict) -> List[Message]:
)
)

# Constant length
if (
"composition" in description
and description["min_length"] == description["max_length"]
):
messages.append(
Message(
column_name=col,
message_type=MessageType.CONSTANT_LENGTH,
values=description,
fields={"composition_min_length", "composition_max_length"},
)
)

# Numerical
if description["type"] == Variable.TYPE_NUM:
# Skewness
Expand Down
11 changes: 10 additions & 1 deletion src/pandas_profiling/report/presentation/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,15 @@

class Dataset(ItemRenderer):
def __init__(
self, package, date_start, date_end, values, messages, variables, **kwargs
self,
package,
date_start,
date_end,
values,
messages,
collapse_warnings,
variables,
**kwargs
):
super().__init__(
"dataset",
Expand All @@ -15,6 +23,7 @@ def __init__(
"values": values,
"messages": messages,
"variables": variables,
"collapse_warnings": collapse_warnings,
"package": package,
},
**kwargs
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
</a>
</div>

<div id="reproduction" class="row collapse">
<div id="reproduction" class="row collapse" aria-expanded="false">
<div class="col-sm-12">
<p class="h2">Reproduction info</p>
<table class="table table-condensed stats">
Expand Down Expand Up @@ -88,13 +88,13 @@
{% if messages %}
<div class="col-sm-12 text-right">
<a role="button" data-toggle="collapse" data-target="#warnings"
aria-expanded="true" aria-controls="collapseExample" class="">
aria-expanded="{% if collapse_warnings %}false{% else %}true{% endif %}" aria-controls="collapseExample" class="">
Toggle Warnings
</a>
</div>

<div id="warnings" class="row collapse in" aria-expanded="true">
<div class="col-sm-12" style="padding-left: 1em;">
<div id="warnings" class="row collapse{% if not collapse_warnings %} in{% endif %}" aria-expanded="{% if collapse_warnings %}false{% else %}true{% endif %}">
<div class="col-sm-12" style="padding-left: 1em;">
<p class="h2">Warnings</p>
<table class="table table-condensed list-warnings">
{% for message in messages %}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@
<a class="anchor" href="#pp_var_{{ message.anchor_id }}"><code>{{ message.column_name }}</code></a> is an unsupported type, check if it needs cleaning or further analysis
</td>
<td>
<span class="label label-warning">Warning</span>
<span class="label label-warning">Rejected</span>
</td>
44 changes: 28 additions & 16 deletions src/pandas_profiling/report/structure/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
ImagePath,
Generic,
)
from pandas_profiling.model.messages import MessageType
from pandas_profiling.report.structure.variables import (
render_boolean,
render_categorical,
Expand Down Expand Up @@ -122,47 +123,51 @@ def render_variables_section(dataframe_summary: dict) -> list:
templs = []

for idx, summary in dataframe_summary["variables"].items():
# TODO: move to render
# Common template variables
def fmt_warning(warning):
name = warning.message_type.name.replace("_", " ")
if name == "HIGH CORRELATION":
name = '<abbr title="This variable has a high correlation with {num} fields: {title}">HIGH CORRELATION</abbr>'.format(
num=len(warning.values["fields"]),
title=", ".join(warning.values["fields"]),
)
return name

warnings = [
fmt_warning(warning)
warning.fmt()
for warning in dataframe_summary["messages"]
if warning.column_name == idx
]
warn_fields = [

warning_fields = {
field
for warning in dataframe_summary["messages"]
if warning.column_name == idx
for field in warning.fields
]
}

warning_types = {
warning.message_type
for warning in dataframe_summary["messages"]
if warning.column_name == idx
}

template_variables = {
"varname": idx,
"varid": hash(idx),
"warnings": warnings,
"warn_fields": warn_fields,
"warn_fields": warning_fields,
}

template_variables.update(summary)

# Per type template variables
template_variables.update(type_to_func[summary["type"]](template_variables))

# Ignore these
if config["reject_variables"].get(bool):
ignore = MessageType.REJECTED in warning_types
else:
ignore = False

templs.append(
Preview(
template_variables["top"],
template_variables["bottom"],
anchor_id=template_variables["varid"],
name=idx,
ignore="ignore" in template_variables,
ignore=ignore,
)
)

Expand Down Expand Up @@ -204,14 +209,21 @@ def get_report_structure(
The profile report in HTML format
"""

collapse_warnings = config["warnings"]["collapse_if_more"].get(int)
if collapse_warnings == 0:
warnings = []
else:
warnings = summary["messages"]

sections = Sequence(
[
Dataset(
package=summary["package"],
date_start=date_start,
date_end=date_end,
values=summary["table"],
messages=summary["messages"],
messages=warnings,
collapse_warnings=len(warnings) > collapse_warnings,
variables=summary["variables"],
name="Overview",
anchor_id="overview",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ def render_generic(summary):
return {
"top": Sequence([info, table, HTML("")], sequence_type="grid"),
"bottom": None,
"ignore": "ignore",
}

# Add class Ignore
Expand Down

0 comments on commit 609c029

Please sign in to comment.