Skip to content

Commit

Permalink
Merge branch 'master' into rejected
Browse files Browse the repository at this point in the history
  • Loading branch information
sbrugman authored Jan 14, 2020
2 parents 5114df7 + 6084c69 commit e598cc5
Show file tree
Hide file tree
Showing 15 changed files with 25,130 additions and 36 deletions.
16,269 changes: 16,267 additions & 2 deletions examples/census/census_report.html

Large diffs are not rendered by default.

17 changes: 17 additions & 0 deletions examples/musical_instrument_reviews/review.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from pathlib import Path

import pandas as pd

from pandas_profiling import ProfileReport

if __name__ == "__main__":
    # Load the gzip-compressed, line-delimited JSON review dump straight
    # from its URL into a DataFrame.
    source_url = r"http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Musical_Instruments_5.json.gz"
    reviews = pd.read_json(source_url, compression="gzip", lines=True)

    # Build the profile report and write it as HTML to a path relative to
    # the current working directory.
    report_title = "Amazon Musical Instrument Review | Profile Report"
    report = ProfileReport(reviews, title=report_title)
    report.to_file(output_file=Path("./review_report.html"))
8,756 changes: 8,756 additions & 0 deletions examples/musical_instrument_reviews/review_report.html

Large diffs are not rendered by default.

8 changes: 6 additions & 2 deletions src/pandas_profiling/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def __init__(self, df, minimal=False, config_file: Path = None, **kwargs):
config.config.set_file(str(config_file))
config.set_kwargs(kwargs)

self.date = datetime.utcnow()
self.date_start = datetime.utcnow()

# Treat index as any other column
if (
Expand All @@ -70,10 +70,14 @@ def __init__(self, df, minimal=False, config_file: Path = None, **kwargs):

# Build report structure
self.sample = self.get_sample(df)
self.report = get_report_structure(self.date, self.sample, description_set)
self.title = config["title"].get(str)
self.description_set = description_set

self.date_end = datetime.utcnow()
self.report = get_report_structure(
self.date_start, self.date_end, self.sample, description_set
)

def sort_column_names(self, df):
sort = config["sort"].get(str)
if sys.version_info[1] <= 5 and sort != "None":
Expand Down
20 changes: 12 additions & 8 deletions src/pandas_profiling/config_dark.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,22 @@ pool_size: 0
# Per variable type description settings
vars:
num:
quantiles:
- 0.05
- 0.25
- 0.5
- 0.75
- 0.95
skewness_threshold: 20
low_categorical_threshold: 5
quantiles:
- 0.05
- 0.25
- 0.5
- 0.75
- 0.95
skewness_threshold: 20
low_categorical_threshold: 5
# Set to zero to disable
chi_squared_threshold: 0.0
cat:
check_composition: True
cardinality_threshold: 50
n_obs: 5
# Set to zero to disable
chi_squared_threshold: 0.0
bool:
n_obs: 3

Expand Down
20 changes: 12 additions & 8 deletions src/pandas_profiling/config_default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,22 @@ pool_size: 0
# Per variable type description settings
vars:
num:
quantiles:
- 0.05
- 0.25
- 0.5
- 0.75
- 0.95
skewness_threshold: 20
low_categorical_threshold: 5
quantiles:
- 0.05
- 0.25
- 0.5
- 0.75
- 0.95
skewness_threshold: 20
low_categorical_threshold: 5
# Set to zero to disable
chi_squared_threshold: 0.999
cat:
check_composition: True
cardinality_threshold: 50
n_obs: 5
# Set to zero to disable
chi_squared_threshold: 0.999
bool:
n_obs: 3

Expand Down
20 changes: 12 additions & 8 deletions src/pandas_profiling/config_minimal.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,22 @@ pool_size: 0
# Per variable type description settings
vars:
num:
quantiles:
- 0.05
- 0.25
- 0.5
- 0.75
- 0.95
skewness_threshold: 20
low_categorical_threshold: 5
quantiles:
- 0.05
- 0.25
- 0.5
- 0.75
- 0.95
skewness_threshold: 20
low_categorical_threshold: 5
# Set to zero to disable
chi_squared_threshold: 0.0
cat:
check_composition: False
cardinality_threshold: 50
n_obs: 5
# Set to zero to disable
chi_squared_threshold: 0.0
bool:
n_obs: 3

Expand Down
10 changes: 10 additions & 0 deletions src/pandas_profiling/model/describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import numpy as np
import pandas as pd
from astropy.stats import bayesian_blocks
from scipy.stats.stats import chisquare

from pandas_profiling import __version__
from pandas_profiling.config import config as config
Expand Down Expand Up @@ -73,6 +74,11 @@ def describe_numeric_1d(series: pd.Series, series_description: dict) -> dict:
"scatter_data": series, # For complex
}

chi_squared_threshold = config["vars"]["num"]["chi_squared_threshold"].get(float)
if chi_squared_threshold > 0.0:
histogram = np.histogram(series[series.notna()].values, bins="auto")[0]
stats["chi_squared"] = chisquare(histogram)

stats["range"] = stats["max"] - stats["min"]
stats.update(
{
Expand Down Expand Up @@ -142,6 +148,10 @@ def describe_categorical_1d(series: pd.Series, series_description: dict) -> dict

stats = {"top": value_counts.index[0], "freq": value_counts.iloc[0]}

# Gate the chi-squared statistic on the *categorical* threshold. The
# categorical uniformity warning reads config["vars"]["cat"] and then indexes
# description["chi_squared"][1]; keying this computation on the numeric
# threshold would leave that entry missing whenever only the categorical
# threshold is enabled.
chi_squared_threshold = config["vars"]["cat"]["chi_squared_threshold"].get(float)
if chi_squared_threshold > 0.0:
    # Stored as a list of the values returned by chisquare on the
    # per-category counts; consumers read index 1 of it.
    stats["chi_squared"] = list(chisquare(value_counts.values))

check_composition = config["vars"]["cat"]["check_composition"].get(bool)
if check_composition:
contains = {
Expand Down
22 changes: 22 additions & 0 deletions src/pandas_profiling/model/messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ class MessageType(Enum):
REJECTED = 15
"""Variables are rejected if we do not want to consider them for further analysis."""

UNIFORM = 14
"""The variable is uniformly distributed"""


class Message(object):
"""A message object (type, values, column)."""
Expand Down Expand Up @@ -208,6 +211,15 @@ def check_variable_messages(col: str, description: dict) -> List[Message]:
Message(column_name=col, message_type=MessageType.TYPE_DATE, values={})
)

# Uniformity
chi_squared_threshold = config["vars"]["cat"]["chi_squared_threshold"].get(
float
)
if 0.0 < chi_squared_threshold < description["chi_squared"][1]:
messages.append(
Message(column_name=col, message_type=MessageType.UNIFORM, values={})
)

# High cardinality
if description["distinct_count"] > config["vars"]["cat"][
"cardinality_threshold"
Expand Down Expand Up @@ -247,6 +259,16 @@ def check_variable_messages(col: str, description: dict) -> List[Message]:
fields={"skewness"},
)
)

# Uniformity
chi_squared_threshold = config["vars"]["num"]["chi_squared_threshold"].get(
float
)
if 0.0 < chi_squared_threshold < description["chi_squared"][1]:
messages.append(
Message(column_name=col, message_type=MessageType.UNIFORM, values={})
)

# Zeros
if warning_value(description["p_zeros"]):
messages.append(
Expand Down
5 changes: 3 additions & 2 deletions src/pandas_profiling/report/presentation/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@

class Dataset(ItemRenderer):
def __init__(
self, package, date, values, messages, collapse_warnings, variables, **kwargs
self, package, date_start, date_end, values, messages, collapse_warnings, variables, **kwargs
):
super().__init__(
"dataset",
{
"date": date,
"date_start": date_start,
"date_end": date_end,
"values": values,
"messages": messages,
"variables": variables,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{% if image_format == 'svg' %}
{{- image }}
{{- image.replace('svg ','svg class="img-responsive center-img"') }}
{% else %}
<img class="img-responsive center-img" src="{{ image }}" alt="{{ alt }}">
{% endif %}
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,12 @@
<table class="table table-condensed stats">
<tbody>
<tr>
<th>Date of analysis</th>
<td>{{ date }}</td>
<th>Analysis started</th>
<td>{{ date_start }}</td>
</tr>
<tr>
<th>Analysis finished</th>
<td>{{ date_end }}</td>
</tr>
<tr>
<th>Version</th>
Expand Down
Empty file.
8 changes: 6 additions & 2 deletions src/pandas_profiling/report/structure/report.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Generate the report."""
from datetime import datetime

import pandas_profiling.visualisation.plot as plot
from pandas_profiling.config import config
Expand Down Expand Up @@ -195,7 +196,9 @@ def get_sample_items(sample: dict):
return items


def get_report_structure(date, sample: dict, summary: dict) -> Renderable:
def get_report_structure(
date_start: datetime, date_end: datetime, sample: dict, summary: dict
) -> Renderable:
"""Generate a HTML report from summary statistics and a given sample.
Args:
Expand All @@ -216,7 +219,8 @@ def get_report_structure(date, sample: dict, summary: dict) -> Renderable:
[
Dataset(
package=summary["package"],
date=date,
date_start=date_start,
date_end=date_end,
values=summary["table"],
messages=warnings,
collapse_warnings=len(warnings) > collapse_warnings,
Expand Down

0 comments on commit e598cc5

Please sign in to comment.