add examples to all public classes/methods (#115)

* add examples to all public classes/methods - use `pip install -e .` for dev install instructions * fix tests * fix test warnings * fix docstring tests * improve series schema test coverage * fix docstring
unionai-oss · Sep 29, 2019 · 4eeeb16 · 4eeeb16
1 parent 3ae8aab
commit 4eeeb16
Show file tree

Hide file tree

Showing 11 changed files with 582 additions and 138 deletions.
diff --git a/README.md b/README.md
@@ -101,7 +101,8 @@ print(validated_df)
 ```
 git clone https://github.com/pandera-dev/pandera.git
 cd pandera
-pip install -r requirements.txt && python setup.py install
+pip install -r requirements.txt
+pip install -e .
 ```
 
 ## Tests

diff --git a/docs/source/API.rst b/docs/source/API.rst
@@ -3,62 +3,62 @@
 API
 ===
 
-``pandera.schemas``
--------------------
+pandera.schemas
+---------------
 
 .. automodule:: pandera.schemas
    :members:
    :undoc-members:
    :show-inheritance:
 
 
-``pandera.schema_components``
------------------------------
+pandera.schema_components
+-------------------------
 
 .. automodule:: pandera.schema_components
    :members:
    :undoc-members:
    :show-inheritance:
 
 
-``pandera.checks``
-------------------
+pandera.checks
+--------------
 
 .. automodule:: pandera.checks
    :members:
    :undoc-members:
    :show-inheritance:
 
 
-``pandera.hypotheses``
-----------------------
+pandera.hypotheses
+------------------
 
 .. automodule:: pandera.hypotheses
    :members:
    :undoc-members:
    :show-inheritance:
 
 
-``pandera.decorators``
-----------------------
+pandera.decorators
+------------------
 
 .. automodule:: pandera.decorators
    :members:
    :undoc-members:
    :show-inheritance:
 
 
-``pandera.dtypes``
-------------------
+pandera.dtypes
+--------------
 
 .. automodule:: pandera.dtypes
    :members:
    :undoc-members:
    :show-inheritance:
 
 
-``pandera.errors``
-------------------
+pandera.errors
+--------------
 
 .. automodule:: pandera.errors
    :members:

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -11,6 +11,7 @@
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 #
 import os
+import doctest
 import sys
 sys.path.insert(0, os.path.abspath('../../pandera'))
 
@@ -36,6 +37,14 @@
 doctest_global_setup = """
 """
 
+doctest_default_flags = (
+    0
+    | doctest.DONT_ACCEPT_TRUE_FOR_1
+    | doctest.ELLIPSIS
+    | doctest.IGNORE_EXCEPTION_DETAIL
+    | doctest.NORMALIZE_WHITESPACE
+)
+
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']
 

diff --git a/docs/source/dataframe_schemas.rst b/docs/source/dataframe_schemas.rst
@@ -358,6 +358,7 @@ for each level in the index hierarchy:
     1   2   b
     2   3   c
 
+.. _multiindex:
 
 MultiIndex Indexes
 ~~~~~~~~~~~~~~~~~~

diff --git a/docs/source/series_schemas.rst b/docs/source/series_schemas.rst
@@ -1,5 +1,7 @@
 .. pandera documentation for seriesschemas
 
+.. _SeriesSchemas:
+
 Series Schemas
 ==============
 

diff --git a/pandera/checks.py b/pandera/checks.py
@@ -3,55 +3,122 @@
 import pandas as pd
 
 from functools import partial
-from typing import Union, Optional, List, Dict
+from typing import Any, Union, Optional, List, Dict, Callable
 
 from . import errors, constants
 from .dtypes import PandasDtype
 
 
+CheckCallable = Callable[
+    [Union[pd.Series, Any]], Union[pd.Series, bool]
+]
+
+
 class Check(object):
 
     def __init__(
             self,
-            fn: callable,
-            groups: Union[str, List[str], None] = None,
-            groupby: Union[str, List[str], callable, None] = None,
-            element_wise: Union[bool, List[bool]] = False,
+            fn: Callable,
+            groups: Optional[Union[str, List[str]]] = None,
+            groupby: Optional[Union[str, List[str], Callable]] = None,
+            element_wise: bool = False,
             error: Optional[str] = None,
             n_failure_cases: Optional[int] = constants.N_FAILURE_CASES):
-        """Check object applies function element-wise or series-wise
-
-        :param callable fn: A function to check series schema. If element_wise
-            is True, then callable signature should be: x -> bool where x is a
-            scalar element in the column. Otherwise, signature is expected
-            to be: pd.Series -> bool|pd.Series[bool].
+        """Apply a validation function to each element, Series, or DataFrame.
+
+        :param fn: A function to check pandas data structure. For Column
+            or SeriesSchema checks, if element_wise is False, this function
+            should have the signature: ``Callable[[pd.Series],
+            Union[pd.Series, bool]]``, where the output series is a boolean
+            vector.
+
+            If element_wise is True, this function should have the signature:
+            ``Callable[[Any], bool]``, where ``Any`` is an element in the
+            column.
+
+            For DataFrameSchema checks, if element_wise=False, fn
+            should have the signature: ``Callable[[pd.DataFrame],
+            Union[pd.DataFrame, pd.Series, bool]]``, where the output dataframe
+            or series contains booleans.
+
+            If element_wise is True, fn is applied to each row in
+            the dataframe with the signature ``Callable[[pd.Series], bool]``
+            where the series input is a row in the dataframe.
         :param groups: The dict input to the `fn` callable will be constrained
             to the groups specified by `groups`.
-        :type groups: str|list[str]|None
-        :param groupby: Only applies to Column Checks. If a string or list of
-            strings is provided, then these columns are used to group the
-            Column Series by `groupby`. If a callable is passed, the expected
-            signature is DataFrame -> DataFrameGroupby. The function has access
-            to the entire dataframe, but the Column.name is selected from this
-            DataFrameGroupby object so that a SeriesGroupBy object is passed
-            into `fn`.
+        :param groupby: If a string or list of strings is provided, these
+            columns are used to group the Column series. If a
+            callable is passed, the expected signature is: ``Callable[
+            [pd.DataFrame], pd.core.groupby.DataFrameGroupBy]``
 
-            Specifying this argument changes the `fn` signature to:
-
-            dict[str|tuple[str], Series] -> bool|pd.Series[bool]
+            In the case of ``Column`` checks, this function has access to the
+            entire dataframe, but ``Column.name`` is selected from this
+            DataFrameGroupby object so that a SeriesGroupBy object is passed
+            into ``fn``.
 
-            Where specific groups can be obtained from the input dict.
-        :type groupby: str|list[str]|callable|None
+            Specifying the groupby argument changes the ``fn`` signature to:
+            ``Callable[[Dict[Union[str, Tuple[str]], pd.Series]],
+            Union[bool, pd.Series]]``, where the input is a dictionary mapping
+            keys to subsets of the column/dataframe.
         :param element_wise: Whether or not to apply validator in an
             element-wise fashion. If bool, assumes that all checks should be
             applied to the column element-wise. If list, should be the same
             number of elements as checks.
-        :type element_wise: bool|list[bool]
-        :param str error: custom error message if series fails validation
+        :param error: custom error message if series fails validation
             check.
-        :type str error:
         :param n_failure_cases: report the top n failure cases. If None, then
             report all failure cases.
+
+        :example:
+
+        >>> import pandas as pd
+        >>> import pandera as pa
+        >>> from pandera import Column, Check, DataFrameSchema
+        >>>
+        >>> # column checks are vectorized by default
+        >>> check_positive = Check(lambda s: s > 0)
+        >>>
+        >>> # define an element-wise check
+        >>> check_even = Check(lambda x: x % 2 == 0, element_wise=True)
+        >>>
+        >>> # specify assertions across categorical variables using `groupby`,
+        >>> # for example, make sure the mean measure for group "A" is always
+        >>> # larger than the mean measure for group "B"
+        >>> check_by_group = Check(
+        ...     lambda measures: measures["A"].mean() > measures["B"].mean(),
+        ...     groupby=["group"],
+        ... )
+        >>>
+        >>> # define a wide DataFrame-level check
+        >>> check_dataframe = Check(
+        ...     lambda df: df["measure_1"] > df["measure_2"])
+        >>>
+        >>> measure_checks = [check_positive, check_even, check_by_group]
+        >>>
+        >>> schema = DataFrameSchema(
+        ...     columns={
+        ...         "measure_1": Column(pa.Int, checks=measure_checks),
+        ...         "measure_2": Column(pa.Int, checks=measure_checks),
+        ...         "group": Column(pa.String),
+        ...     },
+        ...     checks=check_dataframe
+        ... )
+        >>>
+        >>> df = pd.DataFrame({
+        ...     "measure_1": [10, 12, 14, 16],
+        ...     "measure_2": [2, 4, 6, 8],
+        ...     "group": ["B", "B", "A", "A"]
+        ... })
+        >>>
+        >>> schema.validate(df)[["measure_1", "measure_2", "group"]]
+           measure_1  measure_2 group
+        0         10          2     B
+        1         12          4     B
+        2         14          6     A
+        3         16          8     A
+
+        See :ref:`here<checks>` for more usage details.
+
         """
         if element_wise and groupby is not None:
             raise errors.SchemaInitError(
@@ -177,7 +244,7 @@ def _format_input(
             if group_key in groups
         }
 
-    def prepare_series_input(
+    def _prepare_series_input(
             self,
             series: pd.Series,
             dataframe_context: pd.DataFrame) -> Dict[str, pd.Series]:
@@ -206,7 +273,7 @@ def prepare_series_input(
 
         return self._format_input(groupby_obj, self.groups)
 
-    def prepare_dataframe_input(
+    def _prepare_dataframe_input(
             self, dataframe: pd.DataFrame) -> Dict[str, pd.DataFrame]:
         """Prepare input for DataFrameSchema check."""
         if self.groupby is None:

diff --git a/pandera/decorators.py b/pandera/decorators.py
@@ -51,6 +51,40 @@ def check_input(
     :type tail: int
     :param sample: validate a random sample of n rows. Rows overlapping
         with `head` or `tail` are de-duplicated.
+
+    :example:
+
+    Check the input of a decorated function.
+
+    >>> import pandas as pd
+    >>> import pandera as pa
+    >>>
+    >>> from pandera import DataFrameSchema, Column
+    >>>
+    >>>
+    >>> schema = DataFrameSchema({
+    ...     "column": Column(pa.Int),
+    ... })
+    >>>
+    >>> @pa.check_input(schema)
+    ... def transform_data(df: pd.DataFrame) -> pd.DataFrame:
+    ...     df["doubled_column"] = df["column"] * 2
+    ...     return df
+    >>>
+    >>> df = pd.DataFrame({
+    ...     "column": range(5),
+    ... })
+    >>>
+    >>> transform_data(df)
+       column  doubled_column
+    0       0               0
+    1       1               2
+    2       2               4
+    3       3               6
+    4       4               8
+
+    See :ref:`here<decorators>` for more usage details.
+
     """
 
     @wrapt.decorator
@@ -129,6 +163,40 @@ def check_output(
     :type tail: int
     :param sample: validate a random sample of n rows. Rows overlapping
         with `head` or `tail` are de-duplicated.
+
+    :example:
+
+    Check the output of a decorated function.
+
+    >>> import pandas as pd
+    >>> import pandera as pa
+    >>>
+    >>> from pandera import DataFrameSchema, Column, Check
+    >>>
+    >>>
+    >>> schema = DataFrameSchema(
+    ...     columns={
+    ...         "doubled_column": Column(pa.Int),
+    ...     },
+    ...     checks=Check(lambda df: df["doubled_column"] == df["column"] * 2)
+    ... )
+    >>>
+    >>> @pa.check_output(schema)
+    ... def transform_data(df: pd.DataFrame) -> pd.DataFrame:
+    ...     df["doubled_column"] = df["column"] * 2
+    ...     return df
+    >>>
+    >>> df = pd.DataFrame({"column": range(5)})
+    >>>
+    >>> transform_data(df)
+       column  doubled_column
+    0       0               0
+    1       1               2
+    2       2               4
+    3       3               6
+    4       4               8
+
+    See :ref:`here<decorators>` for more usage details.
     """
 
     @wrapt.decorator
-Original file line number
+Diff line change
@@ Expand Up / @@ -358,6 +358,7 @@ for each level in the index hierarchy: @@
 2   b
 3   c
+    .. _multiindex:
     MultiIndex Indexes
     ~~~~~~~~~~~~~~~~~~
@@ Expand Down @@