Merge branch 'master' into rejected

ydataai · Jan 14, 2020 · e598cc5 · e598cc5
2 parents 5114df7 + 6084c69
commit e598cc5
Show file tree

Hide file tree

Showing 15 changed files with 25,130 additions and 36 deletions.
diff --git a/examples/census/census_report.html b/examples/census/census_report.html
diff --git a/examples/musical_instrument_reviews/review.py b/examples/musical_instrument_reviews/review.py
@@ -0,0 +1,17 @@
+from pathlib import Path
+
+import pandas as pd
+
+from pandas_profiling import ProfileReport
+
+if __name__ == "__main__":
+    # The dataset is fetched as gzip-compressed, line-delimited JSON.
+    source_url = r"http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Musical_Instruments_5.json.gz"
+    reviews = pd.read_json(source_url, compression="gzip", lines=True)
+
+    # Profile the frame and write the HTML report relative to the working directory.
+    report_title = "Amazon Musical Instrument Review | Profile Report"
+    report_path = Path("./review_report.html")
+
+    report = ProfileReport(reviews, title=report_title)
+    report.to_file(output_file=report_path)
diff --git a/examples/musical_instrument_reviews/review_report.html b/examples/musical_instrument_reviews/review_report.html
diff --git a/src/pandas_profiling/__init__.py b/src/pandas_profiling/__init__.py
@@ -46,7 +46,7 @@ def __init__(self, df, minimal=False, config_file: Path = None, **kwargs):
             config.config.set_file(str(config_file))
         config.set_kwargs(kwargs)
 
-        self.date = datetime.utcnow()
+        self.date_start = datetime.utcnow()
 
         # Treat index as any other column
         if (
@@ -70,10 +70,14 @@ def __init__(self, df, minimal=False, config_file: Path = None, **kwargs):
 
         # Build report structure
         self.sample = self.get_sample(df)
-        self.report = get_report_structure(self.date, self.sample, description_set)
         self.title = config["title"].get(str)
         self.description_set = description_set
 
+        self.date_end = datetime.utcnow()
+        self.report = get_report_structure(
+            self.date_start, self.date_end, self.sample, description_set
+        )
+
     def sort_column_names(self, df):
         sort = config["sort"].get(str)
         if sys.version_info[1] <= 5 and sort != "None":

diff --git a/src/pandas_profiling/config_dark.yaml b/src/pandas_profiling/config_dark.yaml
@@ -7,18 +7,22 @@ pool_size: 0
 # Per variable type description settings
 vars:
     num:
-          quantiles:
-                - 0.05
-                - 0.25
-                - 0.5
-                - 0.75
-                - 0.95
-          skewness_threshold: 20
-          low_categorical_threshold: 5
+        quantiles:
+              - 0.05
+              - 0.25
+              - 0.5
+              - 0.75
+              - 0.95
+        skewness_threshold: 20
+        low_categorical_threshold: 5
+        # Set to zero to disable
+        chi_squared_threshold: 0.0
     cat:
         check_composition: True
         cardinality_threshold: 50
         n_obs: 5
+        # Set to zero to disable
+        chi_squared_threshold: 0.0
     bool:
         n_obs: 3
 

diff --git a/src/pandas_profiling/config_default.yaml b/src/pandas_profiling/config_default.yaml
@@ -7,18 +7,22 @@ pool_size: 0
 # Per variable type description settings
 vars:
     num:
-          quantiles:
-                - 0.05
-                - 0.25
-                - 0.5
-                - 0.75
-                - 0.95
-          skewness_threshold: 20
-          low_categorical_threshold: 5
+        quantiles:
+              - 0.05
+              - 0.25
+              - 0.5
+              - 0.75
+              - 0.95
+        skewness_threshold: 20
+        low_categorical_threshold: 5
+        # Set to zero to disable
+        chi_squared_threshold: 0.999
     cat:
         check_composition: True
         cardinality_threshold: 50
         n_obs: 5
+        # Set to zero to disable
+        chi_squared_threshold: 0.999
     bool:
         n_obs: 3
 

diff --git a/src/pandas_profiling/config_minimal.yaml b/src/pandas_profiling/config_minimal.yaml
@@ -7,18 +7,22 @@ pool_size: 0
 # Per variable type description settings
 vars:
     num:
-          quantiles:
-                - 0.05
-                - 0.25
-                - 0.5
-                - 0.75
-                - 0.95
-          skewness_threshold: 20
-          low_categorical_threshold: 5
+        quantiles:
+              - 0.05
+              - 0.25
+              - 0.5
+              - 0.75
+              - 0.95
+        skewness_threshold: 20
+        low_categorical_threshold: 5
+        # Set to zero to disable
+        chi_squared_threshold: 0.0
     cat:
         check_composition: False
         cardinality_threshold: 50
         n_obs: 5
+        # Set to zero to disable
+        chi_squared_threshold: 0.0
     bool:
         n_obs: 3
 

diff --git a/src/pandas_profiling/model/describe.py b/src/pandas_profiling/model/describe.py
@@ -11,6 +11,7 @@
 import numpy as np
 import pandas as pd
 from astropy.stats import bayesian_blocks
+from scipy.stats.stats import chisquare
 
 from pandas_profiling import __version__
 from pandas_profiling.config import config as config
@@ -73,6 +74,11 @@ def describe_numeric_1d(series: pd.Series, series_description: dict) -> dict:
         "scatter_data": series,  # For complex
     }
 
+    chi_squared_threshold = config["vars"]["num"]["chi_squared_threshold"].get(float)
+    if chi_squared_threshold > 0.0:
+        histogram = np.histogram(series[series.notna()].values, bins="auto")[0]
+        stats["chi_squared"] = chisquare(histogram)
+
     stats["range"] = stats["max"] - stats["min"]
     stats.update(
         {
@@ -142,6 +148,10 @@ def describe_categorical_1d(series: pd.Series, series_description: dict) -> dict
 
     stats = {"top": value_counts.index[0], "freq": value_counts.iloc[0]}
 
+    chi_squared_threshold = config["vars"]["cat"]["chi_squared_threshold"].get(float)
+    if chi_squared_threshold > 0.0:  # zero disables the test; read "cat" so it matches the categorical message check
+        stats["chi_squared"] = list(chisquare(value_counts.values))  # [statistic, p-value]
+
     check_composition = config["vars"]["cat"]["check_composition"].get(bool)
     if check_composition:
         contains = {

diff --git a/src/pandas_profiling/model/messages.py b/src/pandas_profiling/model/messages.py
@@ -60,6 +60,9 @@ class MessageType(Enum):
     REJECTED = 15
     """Variables are rejected if we do not want to consider them for further analysis."""
 
+    UNIFORM = 14
+    """The variable is uniformly distributed"""
+
 
 class Message(object):
     """A message object (type, values, column)."""
@@ -208,6 +211,15 @@ def check_variable_messages(col: str, description: dict) -> List[Message]:
                 Message(column_name=col, message_type=MessageType.TYPE_DATE, values={})
             )
 
+        # Uniformity
+        chi_squared_threshold = config["vars"]["cat"]["chi_squared_threshold"].get(
+            float
+        )
+        if 0.0 < chi_squared_threshold < description["chi_squared"][1]:
+            messages.append(
+                Message(column_name=col, message_type=MessageType.UNIFORM, values={})
+            )
+
         # High cardinality
         if description["distinct_count"] > config["vars"]["cat"][
             "cardinality_threshold"
@@ -247,6 +259,16 @@ def check_variable_messages(col: str, description: dict) -> List[Message]:
                     fields={"skewness"},
                 )
             )
+
+        # Uniformity
+        chi_squared_threshold = config["vars"]["num"]["chi_squared_threshold"].get(
+            float
+        )
+        if 0.0 < chi_squared_threshold < description["chi_squared"][1]:
+            messages.append(
+                Message(column_name=col, message_type=MessageType.UNIFORM, values={})
+            )
+
         # Zeros
         if warning_value(description["p_zeros"]):
             messages.append(

diff --git a/src/pandas_profiling/report/presentation/core/dataset.py b/src/pandas_profiling/report/presentation/core/dataset.py
@@ -5,12 +5,13 @@
 
 class Dataset(ItemRenderer):
     def __init__(
-        self, package, date, values, messages, collapse_warnings, variables, **kwargs
+        self, package, date_start, date_end, values, messages, collapse_warnings, variables, **kwargs
     ):
         super().__init__(
             "dataset",
             {
-                "date": date,
+                "date_start": date_start,
+                "date_end": date_end,
                 "values": values,
                 "messages": messages,
                 "variables": variables,

diff --git a/src/pandas_profiling/report/presentation/flavours/html/templates/diagram.html b/src/pandas_profiling/report/presentation/flavours/html/templates/diagram.html
@@ -1,5 +1,5 @@
 {% if image_format == 'svg' %}
-    {{- image }}
+    {#- Add the classes to the first <svg tag only, keeping the space before its next attribute #}{{- image.replace('<svg ', '<svg class="img-responsive center-img" ', 1) }}
 {% else %}
     <img class="img-responsive center-img" src="{{ image }}" alt="{{ alt }}">
 {% endif %}

diff --git a/src/pandas_profiling/report/presentation/flavours/html/templates/diagram_small.html b/src/pandas_profiling/report/presentation/flavours/html/templates/diagram_small.html
diff --git a/src/pandas_profiling/report/presentation/flavours/html/templates/overview/overview.html b/src/pandas_profiling/report/presentation/flavours/html/templates/overview/overview.html
@@ -61,8 +61,12 @@
             <table class="table table-condensed stats">
                 <tbody>
                 <tr>
-                    <th>Date of analysis</th>
-                    <td>{{ date }}</td>
+                    <th>Analysis started</th>
+                    <td>{{ date_start }}</td>
+                </tr>
+                <tr>
+                    <th>Analysis finished</th>
+                    <td>{{ date_end }}</td>
                 </tr>
                 <tr>
                     <th>Version</th>

diff --git a/...ofiling/report/presentation/flavours/html/templates/overview/warnings/warning_unique.html b/...ofiling/report/presentation/flavours/html/templates/overview/warnings/warning_unique.html
diff --git a/src/pandas_profiling/report/structure/report.py b/src/pandas_profiling/report/structure/report.py
@@ -1,4 +1,5 @@
 """Generate the report."""
+from datetime import datetime
 
 import pandas_profiling.visualisation.plot as plot
 from pandas_profiling.config import config
@@ -195,7 +196,9 @@ def get_sample_items(sample: dict):
     return items
 
 
-def get_report_structure(date, sample: dict, summary: dict) -> Renderable:
+def get_report_structure(
+    date_start: datetime, date_end: datetime, sample: dict, summary: dict
+) -> Renderable:
     """Generate a HTML report from summary statistics and a given sample.
 
     Args:
@@ -216,7 +219,8 @@ def get_report_structure(date, sample: dict, summary: dict) -> Renderable:
         [
             Dataset(
                 package=summary["package"],
-                date=date,
+                date_start=date_start,
+                date_end=date_end,
                 values=summary["table"],
                 messages=warnings,
                 collapse_warnings=len(warnings) > collapse_warnings,