Skip to content

Commit

Permalink
add examples to all public classes/methods (#115)
Browse files Browse the repository at this point in the history
* add examples to all public classes/methods

- use `pip install -e .` for dev install instructions

* fix tests

* fix test warnings

* fix docstring tests

* improve series schema test coverage

* fix docstring
  • Loading branch information
cosmicBboy authored Sep 29, 2019
1 parent 3ae8aab commit 4eeeb16
Show file tree
Hide file tree
Showing 11 changed files with 582 additions and 138 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,8 @@ print(validated_df)
```
git clone https://github.com/pandera-dev/pandera.git
cd pandera
pip install -r requirements.txt && python setup.py install
pip install -r requirements.txt
pip install -e .
```

## Tests
Expand Down
28 changes: 14 additions & 14 deletions docs/source/API.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,62 +3,62 @@
API
===

``pandera.schemas``
-------------------
pandera.schemas
---------------

.. automodule:: pandera.schemas
:members:
:undoc-members:
:show-inheritance:


``pandera.schema_components``
-----------------------------
pandera.schema_components
-------------------------

.. automodule:: pandera.schema_components
:members:
:undoc-members:
:show-inheritance:


``pandera.checks``
------------------
pandera.checks
--------------

.. automodule:: pandera.checks
:members:
:undoc-members:
:show-inheritance:


``pandera.hypotheses``
----------------------
pandera.hypotheses
------------------

.. automodule:: pandera.hypotheses
:members:
:undoc-members:
:show-inheritance:


``pandera.decorators``
----------------------
pandera.decorators
------------------

.. automodule:: pandera.decorators
:members:
:undoc-members:
:show-inheritance:


``pandera.dtypes``
------------------
pandera.dtypes
--------------

.. automodule:: pandera.dtypes
:members:
:undoc-members:
:show-inheritance:


``pandera.errors``
------------------
pandera.errors
--------------

.. automodule:: pandera.errors
:members:
Expand Down
9 changes: 9 additions & 0 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import doctest
import sys
sys.path.insert(0, os.path.abspath('../../pandera'))

Expand All @@ -36,6 +37,14 @@
doctest_global_setup = """
"""

doctest_default_flags = (
0
| doctest.DONT_ACCEPT_TRUE_FOR_1
| doctest.ELLIPSIS
| doctest.IGNORE_EXCEPTION_DETAIL
| doctest.NORMALIZE_WHITESPACE
)

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

Expand Down
1 change: 1 addition & 0 deletions docs/source/dataframe_schemas.rst
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,7 @@ for each level in the index hierarchy:
1 2 b
2 3 c

.. _multiindex:

MultiIndex Indexes
~~~~~~~~~~~~~~~~~~
Expand Down
2 changes: 2 additions & 0 deletions docs/source/series_schemas.rst
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
.. pandera documentation for seriesschemas
.. _SeriesSchemas:

Series Schemas
==============

Expand Down
125 changes: 96 additions & 29 deletions pandera/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,55 +3,122 @@
import pandas as pd

from functools import partial
from typing import Union, Optional, List, Dict
from typing import Any, Union, Optional, List, Dict, Callable

from . import errors, constants
from .dtypes import PandasDtype


CheckCallable = Callable[
[Union[pd.Series, Any]], Union[pd.Series, bool]
]


class Check(object):

def __init__(
self,
fn: callable,
groups: Union[str, List[str], None] = None,
groupby: Union[str, List[str], callable, None] = None,
element_wise: Union[bool, List[bool]] = False,
fn: Callable,
groups: Optional[Union[str, List[str]]] = None,
groupby: Optional[Union[str, List[str], Callable]] = None,
element_wise: bool = False,
error: Optional[str] = None,
n_failure_cases: Optional[int] = constants.N_FAILURE_CASES):
"""Check object applies function element-wise or series-wise
:param callable fn: A function to check series schema. If element_wise
is True, then callable signature should be: x -> bool where x is a
scalar element in the column. Otherwise, signature is expected
to be: pd.Series -> bool|pd.Series[bool].
"""Apply a validation function to each element, Series, or DataFrame.
:param fn: A function to check pandas data structure. For Column
or SeriesSchema checks, if element_wise is False, this function
should have the signature: ``Callable[[pd.Series],
Union[pd.Series, bool]]``, where the output series is a boolean
vector.
If element_wise is True, this function should have the signature:
``Callable[[Any], bool]``, where ``Any`` is an element in the
column.
For DataFrameSchema checks, if element_wise=False, fn
should have the signature: ``Callable[[pd.DataFrame],
Union[pd.DataFrame, pd.Series, bool]]``, where the output dataframe
or series contains booleans.
If element_wise is True, fn is applied to each row in
the dataframe with the signature ``Callable[[pd.Series], bool]``
where the series input is a row in the dataframe.
:param groups: The dict input to the `fn` callable will be constrained
to the groups specified by `groups`.
:type groups: str|list[str]|None
:param groupby: Only applies to Column Checks. If a string or list of
strings is provided, then these columns are used to group the
Column Series by `groupby`. If a callable is passed, the expected
signature is DataFrame -> DataFrameGroupby. The function has access
to the entire dataframe, but the Column.name is selected from this
DataFrameGroupby object so that a SeriesGroupBy object is passed
into `fn`.
:param groupby: If a string or list of strings is provided, these
columns are used to group the Column series. If a
callable is passed, the expected signature is: ``Callable[
[pd.DataFrame], pd.core.groupby.DataFrameGroupBy]``
Specifying this argument changes the `fn` signature to:
dict[str|tuple[str], Series] -> bool|pd.Series[bool]
In the case of ``Column`` checks, this function has access to the
entire dataframe, but ``Column.name`` is selected from this
DataFrameGroupBy object so that a SeriesGroupBy object is passed
into ``fn``.
Where specific groups can be obtained from the input dict.
:type groupby: str|list[str]|callable|None
Specifying the groupby argument changes the ``fn`` signature to: ``
Callable[[Dict[Union[str, Tuple[str]], pd.Series]],
Union[bool, pd.Series]]``, where the input is a dictionary mapping
keys to subsets of the column/dataframe.
:param element_wise: Whether or not to apply validator in an
element-wise fashion. If bool, assumes that all checks should be
applied to the column element-wise. If list, should be the same
number of elements as checks.
:type element_wise: bool|list[bool]
:param str error: custom error message if series fails validation
:param error: custom error message if series fails validation
check.
:type str error:
:param n_failure_cases: report the top n failure cases. If None, then
report all failure cases.
:example:
>>> import pandas as pd
>>> import pandera as pa
>>> from pandera import Column, Check, DataFrameSchema
>>>
>>> # column checks are vectorized by default
>>> check_positive = Check(lambda s: s > 0)
>>>
>>> # define an element-wise check
>>> check_even = Check(lambda x: x % 2 == 0, element_wise=True)
>>>
>>> # specify assertions across categorical variables using `groupby`,
>>> # for example, make sure the mean measure for group "A" is always
>>> # larger than the mean measure for group "B"
>>> check_by_group = Check(
... lambda measures: measures["A"].mean() > measures["B"].mean(),
... groupby=["group"],
... )
>>>
>>> # define a wide DataFrame-level check
>>> check_dataframe = Check(
... lambda df: df["measure_1"] > df["measure_2"])
>>>
>>> measure_checks = [check_positive, check_even, check_by_group]
>>>
>>> schema = DataFrameSchema(
... columns={
... "measure_1": Column(pa.Int, checks=measure_checks),
... "measure_2": Column(pa.Int, checks=measure_checks),
... "group": Column(pa.String),
... },
... checks=check_dataframe
... )
>>>
>>> df = pd.DataFrame({
... "measure_1": [10, 12, 14, 16],
... "measure_2": [2, 4, 6, 8],
... "group": ["B", "B", "A", "A"]
... })
>>>
>>> schema.validate(df)[["measure_1", "measure_2", "group"]]
measure_1 measure_2 group
0 10 2 B
1 12 4 B
2 14 6 A
3 16 8 A
See :ref:`here<checks>` for more usage details.
"""
if element_wise and groupby is not None:
raise errors.SchemaInitError(
Expand Down Expand Up @@ -177,7 +244,7 @@ def _format_input(
if group_key in groups
}

def prepare_series_input(
def _prepare_series_input(
self,
series: pd.Series,
dataframe_context: pd.DataFrame) -> Dict[str, pd.Series]:
Expand Down Expand Up @@ -206,7 +273,7 @@ def prepare_series_input(

return self._format_input(groupby_obj, self.groups)

def prepare_dataframe_input(
def _prepare_dataframe_input(
self, dataframe: pd.DataFrame) -> Dict[str, pd.DataFrame]:
"""Prepare input for DataFrameSchema check."""
if self.groupby is None:
Expand Down
68 changes: 68 additions & 0 deletions pandera/decorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,40 @@ def check_input(
:type tail: int
:param sample: validate a random sample of n rows. Rows overlapping
with `head` or `tail` are de-duplicated.
:example:
Check the input of a decorated function.
>>> import pandas as pd
>>> import pandera as pa
>>>
>>> from pandera import DataFrameSchema, Column
>>>
>>>
>>> schema = DataFrameSchema({
... "column": Column(pa.Int),
... })
>>>
>>> @pa.check_input(schema)
... def transform_data(df: pd.DataFrame) -> pd.DataFrame:
... df["doubled_column"] = df["column"] * 2
... return df
>>>
>>> df = pd.DataFrame({
... "column": range(5),
... })
>>>
>>> transform_data(df)
column doubled_column
0 0 0
1 1 2
2 2 4
3 3 6
4 4 8
See :ref:`here<decorators>` for more usage details.
"""

@wrapt.decorator
Expand Down Expand Up @@ -129,6 +163,40 @@ def check_output(
:type tail: int
:param sample: validate a random sample of n rows. Rows overlapping
with `head` or `tail` are de-duplicated.
:example:
Check the output of a decorated function.
>>> import pandas as pd
>>> import pandera as pa
>>>
>>> from pandera import DataFrameSchema, Column, Check
>>>
>>>
>>> schema = DataFrameSchema(
... columns={
... "doubled_column": Column(pa.Int),
... },
... checks=Check(lambda df: df["doubled_column"] == df["column"] * 2)
... )
>>>
>>> @pa.check_output(schema)
... def transform_data(df: pd.DataFrame) -> pd.DataFrame:
... df["doubled_column"] = df["column"] * 2
... return df
>>>
>>> df = pd.DataFrame({"column": range(5)})
>>>
>>> transform_data(df)
column doubled_column
0 0 0
1 1 2
2 2 4
3 3 6
4 4 8
See :ref:`here<decorators>` for more usage details.
"""

@wrapt.decorator
Expand Down
Loading

0 comments on commit 4eeeb16

Please sign in to comment.