diff --git a/.gitignore b/.gitignore index 090fd29f..f63f6e4c 100644 --- a/.gitignore +++ b/.gitignore @@ -139,3 +139,6 @@ jspm_packages # Extra /static/ /public/ + +# Temporary files +.#* diff --git a/.travis.yml b/.travis.yml index f202267c..d9ddc105 100644 --- a/.travis.yml +++ b/.travis.yml @@ -47,6 +47,7 @@ install: script: - make test + - make docs after_success: - pip install coveralls diff --git a/Makefile b/Makefile index 0d36fc51..f6c95804 100644 --- a/Makefile +++ b/Makefile @@ -17,7 +17,7 @@ install-backend: ## Install the dependencies for the backend app pip3 install --upgrade --no-cache-dir --exists-action w -r requirements.txt install-dev: ## Install the additional development dependencies for the app - pip3 install --upgrade --no-cache-dir -r requirements.dev + pip3 install --upgrade --no-cache-dir -r requirements.dev -r requirements.doc install-frontend: ## Install the dependencies for frontend development and compilation npm install @@ -116,3 +116,9 @@ server: ## Command to run the app as queue or server spec: wget -O frontend/spec.json https://raw.githubusercontent.com/frictionlessdata/data-quality-spec/master/spec.json + +docs: + sphinx-build -b html docs/ docs/_build + +docs-watch: + sphinx-autobuild docs/ docs/_build/html/ diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 00000000..4df13ae8 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SPHINXPROJ = Goodtables +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 00000000..8220e791 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,187 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Goodtables documentation build configuration file, created by +# sphinx-quickstart on Fri Dec 1 14:50:05 2017. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) +from recommonmark.parser import CommonMarkParser +from recommonmark.transform import AutoStructify + + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = ['sphinx.ext.viewcode'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = ['.rst', '.md'] + +# The master toctree document. +master_doc = 'index' + +# General information about the project. 
+project = 'Goodtables' +copyright = '2017, Open Knowledge International' +author = 'Open Knowledge International' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '' +# The full version, including alpha/beta/rc tags. +release = '' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +html_theme_options = { + 'description': 'Continuous validation for tabular datasets', + 'github_user': 'frictionlessdata', + 'github_repo': 'goodtables.io', + 'github_type': 'star', + 'github_count': False, +} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+html_static_path = ['_static'] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# This is required for the alabaster theme +# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars +html_sidebars = { + '**': [ + 'about.html', + 'navigation.html', + 'searchbox.html', + ] +} + + +# -- Options for HTMLHelp output ------------------------------------------ + +# Output file base name for HTML help builder. +htmlhelp_basename = 'Goodtablesdoc' + + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'Goodtables.tex', 'Goodtables Documentation', + 'Open Knowledge International', 'manual'), +] + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'goodtables', 'Goodtables Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. 
List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'Goodtables', 'Goodtables Documentation', + author, 'Goodtables', 'One line description of project.', + 'Miscellaneous'), +] + +source_parsers = { + '.md': CommonMarkParser, +} + + +# app setup hook +def setup(app): + app.add_config_value('recommonmark_config', { + 'enable_eval_rst': True, + }, True) + app.add_transform(AutoStructify) diff --git a/docs/configuring.md b/docs/configuring.md new file mode 100644 index 00000000..2f8242a2 --- /dev/null +++ b/docs/configuring.md @@ -0,0 +1,136 @@ +# Configuration + +Goodtables.io is configured via a `goodtables.yml` file in the root directory. For example, you can define: + +* Which files goodtables should validate +* Which spreadsheet page should be validated +* What delimiter your CSV file uses (e.g. `;`) +* Which validation checks should be executed + +The rest of this page is divided in sections on common things you want to change. For the full reference, check the [goodtables.yml file reference][gtyml-reference]. + +## Defining the files to validate + +By default goodtables validates all files with extension CSV, ODS, XLS, or XLSX, and all files named `datapackage.json`. + +You can overwrite the default files in `goodtables.yml`: + +```yaml +files: + - source: data1.csv + schema: schema1.json + - source: data2.xls + schema: schema2.json +``` + +Alternatively, you can define a pattern like: + +```yaml +files: '*.csv' +``` + +You can also configure how the file is loaded using the options: + +```eval_rst ++-----------------------------------+-----------------------------------+ +| Option | Description | ++===================================+===================================+ +| format | The file format (csv, xls, ...) | ++-----------------------------------+-----------------------------------+ +| encoding | The file encoding (utf-8, ...) 
| ++-----------------------------------+-----------------------------------+ +| skip_rows | Either the number of rows to | +| | skip, or an array of strings | +| | (e.g. ``#``, ``//``, ...). Rows | +| | that begin with any of the | +| | strings will be ignored. | ++-----------------------------------+-----------------------------------+ +``` + +## Validating data packages + +By default goodtables validates all files named `datapackage.json`. + +You can overwrite this default in `goodtables.yml`: + +```yaml +datapackages: + - report1/datapackage.json + - report2/datapackage.json +``` + +## Validating CSV files with custom dialects + +You can configure how the CSV file is loaded by adding one of the following options on `goodtables.yml`: + +```yaml +files: + - source: data.csv + delimiter: ; + doublequote: True + escapechar: \ + lineterminator: \r\n + quotechar: " +``` + +The entire list of options can be found on the [Python CSV formatting reference][python-csv-docs]. + +## Defining the spreadsheet page to validate + +By default goodtables validates the first sheet of a spreadsheet. + +You can overwrite the default sheet in `goodtables.yml`: + +```yaml +files: + - source: data.xlsx + sheet: 3 +``` + +## Changing the limit of rows to validate + +By default goodtables validates at most 1,000 rows. You can change it in `goodtables.yml`: + +```yaml +settings: + row_limit: 2000 +``` + +## Defining which validation checks are executed + +By default goodtables runs all validation checks. You can customize which checks are executed in `goodtables.yml`: + +```yaml +settings: + checks: + # You can pass check types + - structure + - schema + # ... or individual checks + - blank-header + - duplicate-row + - missing-value + skip_checks: + # You can also skip individual checks + - minimum-constraint +``` + +Note that if you use the `checks` setting, you have to define all checks you want to be used. Because of this, we recommend using `skip_checks` instead. 
+ +The list of validation checks can be found in the [goodtables-py documentation][gtpy-docs]. + +## Automatically inferring the schema + +By default goodtables does not infer the data schema. You can enable schema inference in `goodtables.yml`: + +```yaml +settings: + infer_schema: True + infer_fields: True +``` + +Goodtables will infer the schema of all files and columns that don't have an explicit schema. + +[gtyml-reference]: goodtables_yml.html "goodtables.yml file reference" +[python-csv-docs]: https://docs.python.org/3.6/library/csv.html#csv-fmt-params "Python CSV Formatting docs" +[gtpy-docs]: https://github.com/frictionlessdata/goodtables-py "Goodtables.py documentation" diff --git a/docs/getting_started.md b/docs/getting_started.md new file mode 100644 index 00000000..0e55e970 --- /dev/null +++ b/docs/getting_started.md @@ -0,0 +1,9 @@ +# Getting started + +```eval_rst +.. toctree:: + :maxdepth: 2 + + getting_started_github + writing_data_schema +``` diff --git a/docs/getting_started_github.md b/docs/getting_started_github.md new file mode 100644 index 00000000..a28aa1b5 --- /dev/null +++ b/docs/getting_started_github.md @@ -0,0 +1,29 @@ +# Validating data on GitHub + +This is a very short tutorial on using goodtables.io to continuously validate data hosted on [GitHub][github]. + +## Prerequisites + +* Tabular data in a [GitHub repository][gh-new-repo] + +## Instructions + +1. Log in to [goodtables.io][gtio] using your GitHub account and accept the permissions confirmation. +1. Once we've synchronized your repository list, go to the [Manage Sources][gtio-managesources] page and enable the repository with the data you want to validate. + * If you can't find the repository, try clicking on the Refresh button on the Manage Sources page. + +Goodtables will then validate all tabular data files (CSV, XLS, XLSX, ODS) and [data packages][datapackage] in the repository. These validations will be executed on every change, including pull requests. 
+ +## Next steps + +* Add a badge to your README to display your data validation status. The instructions are on the "Get badge" tab in the data report page. +* [Write a table schema][gtio-dataschema] to validate the contents of your data +* [Configure which files are validated and how][gtio-configuring] + +[gtio]: https://goodtables.io/ "Goodtables.io" +[github]: https://github.com/ "GitHub" +[gh-new-repo]: https://help.github.com/articles/create-a-repo/ "GitHub: Create new repository tutorial" +[gtio-managesources]: https://goodtables.io/settings "Goodtables.io: Manage sources" +[datapackage]: https://frictionlessdata.io/data-packages/ "Data Package" +[gtio-dataschema]: writing_data_schema.html "Writing a data schema" +[gtio-configuring]: configuring.html "Configuring goodtables.io" diff --git a/docs/goodtables_yml.md b/docs/goodtables_yml.md new file mode 100644 index 00000000..d1a92f76 --- /dev/null +++ b/docs/goodtables_yml.md @@ -0,0 +1,35 @@ +# The goodtables.yml + +```yaml +files: + - source: data.csv + schema: schema.json + format: csv + encoding: utf-8 + skip_rows: 3 + delimiter: ';' + + - source: data.xls + format: xls + sheet: 4 + +datapackages: + - 'datapackage.json' + +settings: + checks: + # You can pass check types + - structure + - schema + # ...or individual checks + - no-headers + - blank-headers + skip_checks: + - duplicate-lines + error_limit: 1 + table_limit: 1 + row_limit: 5000 + infer_schema: True + infer_fields: True + order_fields: True +``` diff --git a/docs/img/failed_validation.png b/docs/img/failed_validation.png new file mode 100644 index 00000000..dc38af6f Binary files /dev/null and b/docs/img/failed_validation.png differ diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 00000000..b3724f17 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,38 @@ +# goodtables.io: Continuous tabular data validation + +Goodtables increases your confidence in your data by performing a number of +checks on it. 
It is able to validate things like: + +* Is my CSV valid? +* Do all rows have the same number of columns? +* Are all dates valid? +* Are there any invalid e-mail addresses? + +If any of the checks fail, goodtables generates a report telling you which +errors occurred and where. + +[![Example report with failed validation](img/failed_validation.png)][bhx-schools] + +By adding goodtables to your data publishing workflow, you'll make sure your +data is free from these types of errors. + +## Features + +* **Structural checks**: Ensure that there are no empty rows, no blank headers, etc. +* **Content checks**: Ensure that the values have the correct types ("string", "number", "date", etc.), that their format is valid ("string must be an e-mail"), and that they respect the constraints ("age must be a number greater than 18"). +* **Support for multiple tabular formats**: CSV, Excel, LibreOffice, Data Package, etc. +* **Automatically validate on every update on GitHub** + +## Table of Contents + +```eval_rst +.. toctree:: + :maxdepth: 2 + + getting_started + configuring + goodtables_yml +``` + + +[bhx-schools]: https://goodtables.io/github/vitorbaptista/birmingham_schools "Birmingham Schools validation report" diff --git a/docs/writing_data_schema.md b/docs/writing_data_schema.md new file mode 100644 index 00000000..d431c994 --- /dev/null +++ b/docs/writing_data_schema.md @@ -0,0 +1,102 @@ +# Describing your data schema + +Without knowledge of the data schema, goodtables is only able to check if the structure of the data is valid. For example, that all rows have the same number of columns, that there are no blank headers, etc. To validate the actual contents, you need to describe the data schema. + +The data schema describes what each column should contain (strings, numbers, dates), their formats (this string should be an e-mail), and constraints (numbers in the age column should be greater than 18). You can think of it as a kind of data dictionary. 
The best way to describe the data schema is by writing a data package. + +## Instructions + +In the root folder of your data, create a `datapackage.json` file with the following contents: + +```json +{ + "name": "my-dataset", + "title": "My dataset", + "resources": [ + { + "name": "my-data", + "path": "data/data.csv" + } + ] +} +``` + +This is the simplest tabular data package we can create. Let's see what our data looks like so we can write the table schema for it. + +```eval_rst ++------------+------+------+--------+ +| date | from | to | amount | ++============+======+======+========+ +| 2017-01-01 | Jane | John | 1000 | ++------------+------+------+--------+ +| 2017-01-15 | Jane | Paul | 500 | ++------------+------+------+--------+ +| 2017-02-03 | John | Jane | 2000 | ++------------+------+------+--------+ +``` + +A table schema has three parts: the data type ("string", "number", "date"), the data format ("e-mail", "URI", "ISO date"), and the constraints ("number must be above 18"). Not all columns will have all three parts. 
+ +In our data, we have the following columns: + +```eval_rst ++--------+---------+------------+-----------------------+ +| column | type | format | constraints | ++========+=========+============+=======================+ +| date | date | YYYY-MM-DD | | ++--------+---------+------------+-----------------------+ +| from | string | | | ++--------+---------+------------+-----------------------+ +| to | string | | | ++--------+---------+------------+-----------------------+ +| amount | number | | Greater or equal to 0 | ++--------+---------+------------+-----------------------+ +``` + +Writing this as a table schema in our data package, we have: + +```json +{ + "name": "my-dataset", + "title": "My dataset", + "resources": [ + { + "name": "my-data", + "path": "data/data.csv", + "schema": { + "fields": [ + { + "name": "date", + "type": "date", + "description": "The transaction date" + }, + { + "name": "from", + "type": "string", + "description": "Payer" + }, + { + "name": "to", + "type": "string", + "description": "Payee" + }, + { + "name": "amount", + "type": "number", + "description": "Transaction value in Euros", + "constraints": { + "minimum": 0 + } + } + ] + } + } + ] +} +``` + +Note that we didn't have to define the date format explicitly, as the default format is YYYY-MM-DD. + +You can find all supported data types, formats and constraints in the [Table Schema specification][tableschema]. + +[tableschema]: https://frictionlessdata.io/specs/table-schema/ "Table Schema Specification" diff --git a/requirements.doc b/requirements.doc new file mode 100644 index 00000000..64007dde --- /dev/null +++ b/requirements.doc @@ -0,0 +1,3 @@ +sphinx +recommonmark +sphinx-autobuild