Sphinx build

multimeric · Dec 13, 2016 · a033af2 · a033af2
1 parent dfa02a6
commit a033af2
Show file tree

Hide file tree

Showing 14 changed files with 616 additions and 80 deletions.
diff --git a/.gitignore b/.gitignore
@@ -89,4 +89,9 @@ ENV/
 .ropeproject
 
 # Custom
-.idea
+.idea
+
+.doctrees
+
+doc/_*
+doc/Makefile
diff --git a/README.rst b/README.rst
@@ -1,49 +1,198 @@
+
 PandasSchema
-============
+************
+
 
 Introduction
-------------
-PandasSchema is a module for validating tabulated data, such as CSVs (Comma Separated Value files), and TSVs (Tab
-Separated Value files). It uses the incredibly powerful data analysis tool pandas to do so quickly and efficiently.
+============
+
+PandasSchema is a module for validating tabulated data, such as CSVs
+(Comma Separated Value files), and TSVs (Tab Separated Value files).
+It uses the incredibly powerful data analysis tool pandas to do so
+quickly and efficiently.
 
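 For example, say your code expects a CSV that looks a bit like this:
 
+::
+
+   Given Name,Family Name,Age,Sex,Customer ID
+   Gerald,Hampton,82,Male,2582GABK
+   Yuuwa,Miyake,27,Male,7951WVLW
+   Edyta,Majewska,50,Female,7758NSID
+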
-..
-    Given Name,Family Name,Age,Sex,Customer ID
-    Gerald,Hampton,82,Male,2582GABK
-    Yuuwa,Miyake,27,Male,7951WVLW
-    Edyta,Majewska,50,Female,7758NSID
+Now you want to be able to ensure that the data in your CSV is in the
+correct format:
+
+::
 
-Now you want to be able to ensure that the data in your CSV is in the correct format:
+   >>> import pandas as pd
+   >>> from io import StringIO
+   >>> from pandas_schema import Column, Schema
+   >>> from pandas_schema.validation import LeadingWhitespaceValidation, TrailingWhitespaceValidation, CanConvertValidation, MatchesRegexValidation, InRangeValidation, InListValidation
 
-.. code:: python
-    import pandas as pd
-    from io import StringIO
-    from pandas_schema import Column, Schema
-    from pandas_schema.validation import LeadingWhitespaceValidation, TrailingWhitespaceValidation, CanConvertValidation, MatchesRegexValidation, InRangeValidation, InListValidation
+   >>> schema = Schema([
+   ...     Column('Given Name', [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()]),
+   ...     Column('Family Name', [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()]),
+   ...     Column('Age', [InRangeValidation(0, 120)]),
+   ...     Column('Sex', [InListValidation(['Male', 'Female', 'Other'])]),
+   ...     Column('Customer ID', [MatchesRegexValidation(r'\d{4}[A-Z]{4}')])
+   ... ])
 
-    schema = Schema([
-        Column('Given Name', [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()]),
-        Column('Family Name', [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()]),
-        Column('Age', [InRangeValidation(0, 120)]),
-        Column('Sex', [InListValidation(['Male', 'Female', 'Other'])]),
-        Column('Customer ID', [MatchesRegexValidation(r'\d{4}[A-Z]{4}')])
-    ])
+   >>> test_data = pd.read_csv(StringIO('''Given Name,Family Name,Age,Sex,Customer ID
+   ... Gerald ,Hampton,82,Male,2582GABK
+   ... Yuuwa,Miyake,270,male,7951WVLW
+   ... Edyta,Majewska ,50,Female,775ANSID
+   ... '''))
 
-    test_data = pd.from_csv(StringIO('''
-        Gerald ,Hampton,82,Male,2582GABK
-        Yuuwa,Miyake,270,male,7951WVLW
-        Edyta,Majewska ,50,Female,775ANSID
-    '''))
+   >>> errors = schema.validate(test_data)
 
-    schema.validate()
+   >>> for error in errors:
+   ...    print(error)
+   {row: 0, column: "Given Name"}: "Gerald " contains trailing whitespace
+   {row: 1, column: "Age"}: "270" was not in the range [0, 120)
+   {row: 1, column: "Sex"}: "male" is not in the list of legal options (Male, Female, Other)
+   {row: 2, column: "Family Name"}: "Majewska " contains trailing whitespace
+   {row: 2, column: "Customer ID"}: "775ANSID" does not match the regex "\d{4}[A-Z]{4}"
 
 
 Installation
-------------
+============
+
 Install PandasSchema using pip:
 
-.. code:: bash
-pip install pandas_schema
+::
+
+   pip install pandas_schema
+
+
+API
+===
+
+
+Schema
+------
+
+**class schema.Schema(columns: typing.Iterable[column.Column],
+ordered: bool = False)**
+
+   A schema that defines the columns required in the target DataFrame
+
+   Creates a new pandas schema
+
+   :Parameters:
+      * **columns** -- A list of column objects
+
+      * **ordered** -- True if the data frame must be in the same
+        order as the schema. Defaults to False
+
+   **validate(df: pandas.core.frame.DataFrame) ->
+   typing.List[validation_warning.ValidationWarning]**
+
+      Runs a full validation of the target DataFrame using the
+      internal columns list
+
+      :Parameters:
+         **df** -- A pandas DataFrame to validate
+
+      :Returns:
+         A list of ValidationWarning objects that list the ways in
+         which the DataFrame was invalid
+
+
+Column
+------
+
+**class column.Column(name: str, validations:
+typing.Iterable[typing.validation.BaseValidation] = [],
+allow_empty=False)**
+
+   Creates a new Column object
+
+   :Parameters:
+      * **name** -- The column header that defines this column. This
+        must be identical to the header used in the CSV/Data Frame
+        you are validating.
+
+      * **validations** -- An iterable of objects implementing
+        BaseValidation that will generate ValidationWarnings
+
+      * **allow_empty** -- True if an empty column is considered
+        valid. False if we leave that logic up to the Validation
+
+   **validate(series: pandas.core.series.Series) ->
+   typing.List[validation_warning.ValidationWarning]**
+
+      Creates a list of validation warnings using the Validation
+      objects contained in the Column
+
+      :Parameters:
+         **series** -- A pandas Series to validate
+
+      :Returns:
+         An iterable of ValidationWarning instances generated by the
+         validation
+
+
+Validation
+----------
+
+**class validation.BaseValidation**
+
+   The validation base class that defines any object that can create a
+   list of errors from a Series
+
+   **get_errors(series: pandas.core.series.Series, column:
+   column.Column) ->
+   typing.Iterable[validation_warning.ValidationWarning]**
+
+      Return a list of errors in the given series :param series:
+      :param column: :return:
+
+**class validation.CanCallValidation(func)**
+
+   Validates if a given function can be called on each element in a
+   column without raising an exception
+
+**class validation.CanConvertValidation(_type)**
+
+   Checks if each element in a column can be converted to a Python
+   object type
+
+**class validation.CustomValidation(validation:
+typing.Callable[[pandas.core.series.Series],
+pandas.core.series.Series], message: str)**
+
+   Validates using a user-provided function and message.
+
+   Creates a new validation object
+
+   :Parameters:
+      * **message** -- The error message to provide to the user if
+        this validation fails
+
+      * **validation** -- A function that takes a pandas series and
+        returns a boolean series, where the cell is equal to True if
+        the object passed validation, and False if it failed
+
+**class validation.DateFormatValidation(date_format: str)**
+
+   Checks that each element in this column is a valid date according
+   to a provided format string
+
+**class validation.ElementValidation**
+
+   Implements the BaseValidation interface by returning a Boolean
+   series for each element that either passes or fails the validation
+
+   **get_message() -> str**
+
+      Create a message to be displayed whenever this validation fails
+      :param value: The value of the failing object (Series, or single
+      value)
+
+   **validate(series: pandas.core.series.Series) ->
+   pandas.core.series.Series**
+
+      Returns a Boolean series, where each value of False is an
+      element in the Series that has failed the validation :param
+      series: :return:
+
+**class validation.InListValidation(options: typing.Iterable)**
+
+   Checks that each element in this column is contained within a list
+   of possibilities
+
+**class validation.InRangeValidation(min=-inf, max=inf)**
+
+   Checks that each element in the series is within a given numerical
+   range
+
+**class validation.IsDtypeValidation(dtype: numpy.dtype)**
+
+   Checks that a series has a certain numpy dtype
+
+**class validation.LeadingWhitespaceValidation**
+
+   Checks that there is no leading whitespace in this column
+
+**class validation.MatchesRegexValidation(regex: Pattern[~AnyStr])**
+
+   Validates that a regular expression can match somewhere in each
+   element in this column
+
+   :Parameters:
+      **regex** -- A regular expression object, created using
+      re.compile or similar
 
+**class validation.TrailingWhitespaceValidation**
 
+   Checks that there is no trailing whitespace in this column
diff --git a/__init__.py b/__init__.py
@@ -1,4 +1,4 @@
 from column import Column
-from validation_error import ValidationError
+from validation_warning import ValidationWarning
 from schema import Schema
 import validation
diff --git a/column.py b/column.py
@@ -1,7 +1,7 @@
 import typing
 import validation
 import pandas as pd
-from validation_error import ValidationError
+from validation_warning import ValidationWarning
 
 
 class Column:
@@ -17,7 +17,7 @@ def __init__(self, name: str, validations: typing.Iterable['validation.BaseValid
         self.validations = list(validations)
         self.allow_empty = allow_empty
 
-    def validate(self, series: pd.Series) -> typing.List[ValidationError]:
+    def validate(self, series: pd.Series) -> typing.List[ValidationWarning]:
         """
         Creates a list of validation errors using the Validation objects contained in the Column
         :param series: A pandas Series to validate

diff --git a/doc/README.rst b/doc/README.rst
@@ -0,0 +1,77 @@
+PandasSchema
+============
+
+Introduction
+------------
+PandasSchema is a module for validating tabulated data, such as CSVs (Comma Separated Value files), and TSVs (Tab
+Separated Value files). It uses the incredibly powerful data analysis tool pandas to do so quickly and efficiently.
+
+For example, say your code expects a CSV that looks a bit like this::
+
+
+    Given Name,Family Name,Age,Sex,Customer ID
+    Gerald,Hampton,82,Male,2582GABK
+    Yuuwa,Miyake,27,Male,7951WVLW
+    Edyta,Majewska,50,Female,7758NSID
+
+Now you want to be able to ensure that the data in your CSV is in the correct format:
+
+.. code::
+
+    >>> import pandas as pd
+    >>> from io import StringIO
+    >>> from pandas_schema import Column, Schema
+    >>> from pandas_schema.validation import LeadingWhitespaceValidation, TrailingWhitespaceValidation, CanConvertValidation, MatchesRegexValidation, InRangeValidation, InListValidation
+
+    >>> schema = Schema([
+    ...     Column('Given Name', [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()]),
+    ...     Column('Family Name', [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()]),
+    ...     Column('Age', [InRangeValidation(0, 120)]),
+    ...     Column('Sex', [InListValidation(['Male', 'Female', 'Other'])]),
+    ...     Column('Customer ID', [MatchesRegexValidation(r'\d{4}[A-Z]{4}')])
+    ... ])
+
+    >>> test_data = pd.read_csv(StringIO('''Given Name,Family Name,Age,Sex,Customer ID
+    ... Gerald ,Hampton,82,Male,2582GABK
+    ... Yuuwa,Miyake,270,male,7951WVLW
+    ... Edyta,Majewska ,50,Female,775ANSID
+    ... '''))
+
+    >>> errors = schema.validate(test_data)
+
+    >>> for error in errors:
+    ...    print(error)
+    {row: 0, column: "Given Name"}: "Gerald " contains trailing whitespace
+    {row: 1, column: "Age"}: "270" was not in the range [0, 120)
+    {row: 1, column: "Sex"}: "male" is not in the list of legal options (Male, Female, Other)
+    {row: 2, column: "Family Name"}: "Majewska " contains trailing whitespace
+    {row: 2, column: "Customer ID"}: "775ANSID" does not match the regex "\d{4}[A-Z]{4}"
+
+
+Installation
+------------
+Install PandasSchema using pip:
+
+.. code:: bash
+
+    pip install pandas_schema
+
+API
+---
+Schema
+~~~~~~
+.. py:currentmodule:: schema
+.. autoclass:: Schema
+    :members:
+
+Column
+~~~~~~
+.. py:currentmodule:: column
+.. autoclass:: Column
+    :members:
+
+Validation
+~~~~~~~~~~
+.. automodule:: validation
+    :members:
+    :exclude-members: BaseValidation, ElementValidation