Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rejected #333

Merged
merged 12 commits into from
Jan 14, 2020
7 changes: 7 additions & 0 deletions src/pandas_profiling/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,13 @@ def get_description(self) -> dict:
"""
return self.description_set

def get_rejected_variables(self) -> list:
    """Get the names of the variables that were rejected.

    A variable is reported when one of the messages stored in
    ``self.description_set["messages"]`` has the type
    ``MessageType.REJECTED``.

    Returns:
        A list with the column name of every REJECTED message.
    """
    # `self` was missing from the signature, which made the method
    # impossible to call on an instance.
    return [
        message.column_name
        for message in self.description_set["messages"]
        if message.message_type == MessageType.REJECTED
    ]

def to_file(self, output_file: Path, silent: bool = True) -> None:
"""Write the report to a file.

Expand Down
9 changes: 9 additions & 0 deletions src/pandas_profiling/config_dark.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,19 @@ n_obs_unique: 5
n_extreme_obs: 5
n_freq_table_max: 10

# Configuration related to the samples area
samples:
head: 10
tail: 10

# Configuration related to the warning overview (top) and per variable warnings
warnings:
# Collapse the warnings section when there are more warnings than this; set to zero to hide warnings entirely
collapse_if_more: 20

# Configuration related to the rejection of variables
reject_variables: True

# When in a Jupyter notebook
notebook:
iframe:
Expand Down
9 changes: 9 additions & 0 deletions src/pandas_profiling/config_default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,19 @@ n_obs_unique: 5
n_extreme_obs: 5
n_freq_table_max: 10

# Configuration related to the samples area
samples:
head: 10
tail: 10

# Configuration related to the warning overview (top) and per variable warnings
warnings:
# Collapse the warnings section when there are more warnings than this; set to zero to hide warnings entirely
collapse_if_more: 20

# Configuration related to the rejection of variables
reject_variables: True

# When in a Jupyter notebook
notebook:
iframe:
Expand Down
9 changes: 9 additions & 0 deletions src/pandas_profiling/config_minimal.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,19 @@ n_obs_unique: 5
n_extreme_obs: 5
n_freq_table_max: 10

# Configuration related to the samples area
samples:
head: 10
tail: 10

# Configuration related to the warning overview (top) and per variable warnings
warnings:
# Collapse the warnings section when there are more warnings than this; set to zero to hide warnings entirely
collapse_if_more: 20

# Configuration related to the rejection of variables
reject_variables: True

# When in a Jupyter notebook
notebook:
iframe:
Expand Down
12 changes: 0 additions & 12 deletions src/pandas_profiling/model/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,21 +33,9 @@ class Variable(Enum):

TYPE_COMPLEX = "COMPLEX"

S_TYPE_CONST = "CONST"
"""A constant variable"""

S_TYPE_UNIQUE = "UNIQUE"
"""An unique variable"""

S_TYPE_UNSUPPORTED = "UNSUPPORTED"
"""An unsupported variable"""

S_TYPE_CORR = "CORR"
"""A highly correlated variable"""

S_TYPE_REJECTED = "REJECTED"
"""A rejected variable"""


# Temporary mapping
Boolean = Variable.TYPE_BOOL
Expand Down
56 changes: 53 additions & 3 deletions src/pandas_profiling/model/messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
class MessageType(Enum):
"""Message Types"""

CONST = 1
CONSTANT = 1
"""This variable has a constant value."""

ZEROS = 2
Expand Down Expand Up @@ -52,6 +52,13 @@ class MessageType(Enum):
"""This variable is likely a datetime, but treated as categorical."""

UNIQUE = 12
"""This variable has unique values."""

CONSTANT_LENGTH = 13
"""This variable has a constant length"""

REJECTED = 15
"""Variables are rejected if we do not want to consider them for further analysis."""

UNIFORM = 14
"""The variable is uniformly distributed"""
Expand All @@ -76,6 +83,15 @@ def __init__(
self.column_name = column_name
self.anchor_id = hash(column_name)

def fmt(self):
    """Return the display label for this message.

    The label is the message type name with underscores replaced by
    spaces. For ``HIGH CORRELATION`` the label is wrapped in an ``<abbr>``
    element whose ``title`` lists the entries of ``self.values["fields"]``.

    Returns:
        The label: plain text, or an HTML snippet for HIGH CORRELATION.
    """
    # Imported locally so this method does not depend on the module's
    # import block.
    import html

    # TODO: render in template
    name = self.message_type.name.replace("_", " ")
    if name == "HIGH CORRELATION":
        fields = self.values["fields"]
        # Field names end up inside an HTML attribute: coerce them to str
        # (join() rejects non-string entries) and escape them so quotes or
        # angle brackets cannot break out of the title attribute.
        title = html.escape(", ".join(str(field) for field in fields), quote=True)
        name = '<abbr title="This variable has a high correlation with {num} fields: {title}">HIGH CORRELATION</abbr>'.format(
            num=len(fields), title=title
        )
    return name

def __repr__(self):
return "[{message_type}] warning on column {column}".format(
message_type=self.message_type.name, column=self.column_name
Expand Down Expand Up @@ -141,12 +157,21 @@ def check_variable_messages(col: str, description: dict) -> List[Message]:
messages.append(
Message(
column_name=col,
message_type=MessageType.CONST,
message_type=MessageType.CONSTANT,
values=description,
fields={"n_unique"},
)
)

messages.append(
Message(
column_name=col,
message_type=MessageType.REJECTED,
values=description,
fields={},
)
)

if description["distinct_count_without_nan"] == description["n"]:
messages.append(
Message(
Expand All @@ -168,8 +193,19 @@ def check_variable_messages(col: str, description: dict) -> List[Message]:
)
)

# Unsupported
if description["type"] == Variable.S_TYPE_UNSUPPORTED:
messages.append(
Message(
column_name=col,
message_type=MessageType.REJECTED,
values=description,
fields={},
)
)

# Categorical
if description["type"] in {Variable.TYPE_CAT}:
if description["type"] == Variable.TYPE_CAT:
if description["date_warning"]:
messages.append(
Message(column_name=col, message_type=MessageType.TYPE_DATE, values={})
Expand Down Expand Up @@ -197,6 +233,20 @@ def check_variable_messages(col: str, description: dict) -> List[Message]:
)
)

# Constant length
if (
"composition" in description
and description["min_length"] == description["max_length"]
):
messages.append(
Message(
column_name=col,
message_type=MessageType.CONSTANT_LENGTH,
values=description,
fields={"composition_min_length", "composition_max_length"},
)
)

# Numerical
if description["type"] == Variable.TYPE_NUM:
# Skewness
Expand Down
11 changes: 10 additions & 1 deletion src/pandas_profiling/report/presentation/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,15 @@

class Dataset(ItemRenderer):
def __init__(
self, package, date_start, date_end, values, messages, variables, **kwargs
self,
package,
date_start,
date_end,
values,
messages,
collapse_warnings,
variables,
**kwargs
):
super().__init__(
"dataset",
Expand All @@ -15,6 +23,7 @@ def __init__(
"values": values,
"messages": messages,
"variables": variables,
"collapse_warnings": collapse_warnings,
"package": package,
},
**kwargs
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
</a>
</div>

<div id="reproduction" class="row collapse">
<div id="reproduction" class="row collapse" aria-expanded="false">
<div class="col-sm-12">
<p class="h2">Reproduction info</p>
<table class="table table-condensed stats">
Expand Down Expand Up @@ -88,13 +88,13 @@
{% if messages %}
<div class="col-sm-12 text-right">
<a role="button" data-toggle="collapse" data-target="#warnings"
aria-expanded="true" aria-controls="collapseExample" class="">
aria-expanded="{% if collapse_warnings %}false{% else %}true{% endif %}" aria-controls="warnings" class="">
Toggle Warnings
</a>
</div>

<div id="warnings" class="row collapse in" aria-expanded="true">
<div class="col-sm-12" style="padding-left: 1em;">
<div id="warnings" class="row collapse{% if not collapse_warnings %} in{% endif %}" aria-expanded="{% if collapse_warnings %}false{% else %}true{% endif %}">
<div class="col-sm-12" style="padding-left: 1em;">
<p class="h2">Warnings</p>
<table class="table table-condensed list-warnings">
{% for message in messages %}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@
<a class="anchor" href="#pp_var_{{ message.anchor_id }}"><code>{{ message.column_name }}</code></a> is an unsupported type, check if it needs cleaning or further analysis
</td>
<td>
<span class="label label-warning">Warning</span>
<span class="label label-warning">Rejected</span>
</td>
44 changes: 28 additions & 16 deletions src/pandas_profiling/report/structure/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
ImagePath,
Generic,
)
from pandas_profiling.model.messages import MessageType
from pandas_profiling.report.structure.variables import (
render_boolean,
render_categorical,
Expand Down Expand Up @@ -122,47 +123,51 @@ def render_variables_section(dataframe_summary: dict) -> list:
templs = []

for idx, summary in dataframe_summary["variables"].items():
# TODO: move to render
# Common template variables
def fmt_warning(warning):
    """Return the display label for a warning message.

    The label is the message type name with underscores replaced by
    spaces. For ``HIGH CORRELATION`` the label is wrapped in an ``<abbr>``
    element whose ``title`` lists the entries of ``warning.values["fields"]``.
    """
    # Imported locally so this helper does not depend on the module's
    # import block.
    import html

    name = warning.message_type.name.replace("_", " ")
    if name == "HIGH CORRELATION":
        fields = warning.values["fields"]
        # Field names end up inside an HTML attribute: coerce them to str
        # (join() rejects non-string entries) and escape them so quotes or
        # angle brackets cannot break out of the title attribute.
        title = html.escape(", ".join(str(field) for field in fields), quote=True)
        name = '<abbr title="This variable has a high correlation with {num} fields: {title}">HIGH CORRELATION</abbr>'.format(
            num=len(fields),
            title=title,
        )
    return name

warnings = [
fmt_warning(warning)
warning.fmt()
for warning in dataframe_summary["messages"]
if warning.column_name == idx
]
warn_fields = [

warning_fields = {
field
for warning in dataframe_summary["messages"]
if warning.column_name == idx
for field in warning.fields
]
}

warning_types = {
warning.message_type
for warning in dataframe_summary["messages"]
if warning.column_name == idx
}

template_variables = {
"varname": idx,
"varid": hash(idx),
"warnings": warnings,
"warn_fields": warn_fields,
"warn_fields": warning_fields,
}

template_variables.update(summary)

# Per type template variables
template_variables.update(type_to_func[summary["type"]](template_variables))

# Ignore these
if config["reject_variables"].get(bool):
ignore = MessageType.REJECTED in warning_types
else:
ignore = False

templs.append(
Preview(
template_variables["top"],
template_variables["bottom"],
anchor_id=template_variables["varid"],
name=idx,
ignore="ignore" in template_variables,
ignore=ignore,
)
)

Expand Down Expand Up @@ -204,14 +209,21 @@ def get_report_structure(
The profile report in HTML format
"""

collapse_warnings = config["warnings"]["collapse_if_more"].get(int)
if collapse_warnings == 0:
warnings = []
else:
warnings = summary["messages"]

sections = Sequence(
[
Dataset(
package=summary["package"],
date_start=date_start,
date_end=date_end,
values=summary["table"],
messages=summary["messages"],
messages=warnings,
collapse_warnings=len(warnings) > collapse_warnings,
variables=summary["variables"],
name="Overview",
anchor_id="overview",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ def render_generic(summary):
return {
"top": Sequence([info, table, HTML("")], sequence_type="grid"),
"bottom": None,
"ignore": "ignore",
}

# Add class Ignore
Expand Down