From fc1bee3849e681f68d322b7d71a5c5643d8ecc19 Mon Sep 17 00:00:00 2001 From: Vitor Baptista Date: Fri, 15 Dec 2017 20:48:25 +0000 Subject: [PATCH] [#213] Add Amazon S3 documentation --- docs/getting_started.md | 1 + docs/getting_started_s3.md | 84 ++++++++++++++++++++++++++++++++++++++ docs/index.md | 4 +- 3 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 docs/getting_started_s3.md diff --git a/docs/getting_started.md b/docs/getting_started.md index 0e55e970..279f913c 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -5,5 +5,6 @@ :maxdepth: 2 getting_started_github + getting_started_s3 writing_data_schema ``` diff --git a/docs/getting_started_s3.md b/docs/getting_started_s3.md new file mode 100644 index 00000000..2b9f3565 --- /dev/null +++ b/docs/getting_started_s3.md @@ -0,0 +1,84 @@ +# Validating data on Amazon S3 + +This is a very short tutorial on using goodtables.io to continuously validate data hosted on [Amazon S3][s3]. + +## Pre-requisites + +* A [GitHub][github] login +* An [Amazon S3][s3] login + +## Instructions + +### Setting up an Amazon S3 bucket and read-only user + +1. [Create a bucket on S3][howto-s3bucket] to hold your data + * Create the bucket in the `us-west-2` region. It's a [current limitation][s3-region-bug] of goodtables.io that we're working to fix. +1. [Create a new IAM user][howto-iamuser]. This user will be used by goodtables.io to read your bucket. + * Make sure you take note of the AWS Access Key ID, AWS Secret Access Key, and the User ARN. +1. Go to your [bucket's overview page][bucket-overview], click on the `Permissions` tab, and find the `Bucket Policy` link. We need the permissions: + * _s3:ListBucket_: To list the bucket's contents + * _s3:GetObject_: To read the bucket's files + * _s3:GetBucketPolicy_, _s3:PutBucketPolicy_, _s3:GetBucketLocation_, and _s3:PutBucketNotification_: To set up the AWS Lambda function that notifies goodtables.io when a new file is added. 
+ +The final bucket policy should look like: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "statement1", + "Effect": "Allow", + "Principal": { + "AWS": "IAM_USER_ARN" + }, + "Action": [ + "s3:ListBucket", + "s3:GetBucketLocation", + "s3:GetBucketPolicy", + "s3:PutBucketPolicy", + "s3:PutBucketNotification" + ], + "Resource": "arn:aws:s3:::BUCKET_NAME" + }, + { + "Sid": "statement2", + "Effect": "Allow", + "Principal": { + "AWS": "IAM_USER_ARN" + }, + "Action": ["s3:GetObject"], + "Resource": "arn:aws:s3:::BUCKET_NAME/*" + } + ] +} +``` + +Replace `IAM_USER_ARN` and `BUCKET_NAME` with your IAM User ARN and bucket name. + +### Setting up goodtables.io + +1. Log in to [goodtables.io][gtio] using your GitHub account. +1. Go to the [Manage Sources][gtio-managesources] page, click on the `Amazon` tab, and click on the plus sign on the right of the Filter input. +1. Fill in the `Access Key Id`, `Secret Access Key` and `Bucket Name` with the IAM User and bucket you just created in the previous section. + +We're all set. Goodtables will automatically validate your data whenever a file is added or modified in the bucket. You can now [upload data to your bucket][howto-s3upload] and goodtables will automatically validate any tabular files (CSV, XLS, ODS, ...) and tabular data packages. + +## Next steps + +* [Write a table schema][gtio-dataschema] to validate the contents of your data +* [Configure which files are validated and how][gtio-configuring] + +[gtio]: https://goodtables.io/ "Goodtables.io" +[github]: https://github.com/ "GitHub" +[s3]: https://aws.amazon.com/s3/ "Amazon S3" +[s3-region-bug]: https://github.com/frictionlessdata/goodtables.io/issues/136 "Can't add S3 bucket with other region that Oregon (us-west-2)" +[howto-s3bucket]: https://docs.aws.amazon.com/AmazonS3/latest/user-guide/create-bucket.html "How do I create an S3 Bucket?" 
+[howto-s3upload]: https://docs.aws.amazon.com/AmazonS3/latest/user-guide/upload-objects.html "How do I upload files and folders to an S3 Bucket?" +[howto-iamuser]: http://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html?icmpid=docs_iam_console "Create an IAM User in your AWS account" +[bucket-overview]: https://s3.console.aws.amazon.com/s3/buckets/ "Amazon S3 Bucket list" +[gh-new-repo]: https://help.github.com/articles/create-a-repo/ "GitHub: Create new repository tutorial" +[gtio-managesources]: https://goodtables.io/settings "Goodtables.io: Manage sources" +[datapackage]: https://frictionlessdata.io/data-packages/ "Data Package" +[gtio-dataschema]: writing_data_schema.html "Writing a data schema" +[gtio-configuring]: configuring.html "Configuring goodtables.io" diff --git a/docs/index.md b/docs/index.md index 0b1342f7..a0a797ff 100644 --- a/docs/index.md +++ b/docs/index.md @@ -21,7 +21,7 @@ data is free from these types of errors. * **Structural checks**: Ensure that there are no empty rows, no blank headers, etc. * **Content checks**: Ensure that the values have the correct types ("string", "number", "date", etc.), that their format is valid ("string must be an e-mail"), and that they respect the constraints ("age must be a number greater than 18"). * **Support for multiple tabular formats**: CSV, Excel, LibreOffice, Data Package, etc. -* **Automatically validate on every update on GitHub** +* **Automatically validate data on every update**: Support for data on [GitHub][gettingstarted-github] and [Amazon S3][gettingstarted-s3]. ## Table of Contents @@ -37,3 +37,5 @@ data is free from these types of errors. [bhx-schools]: https://goodtables.io/github/vitorbaptista/birmingham_schools "Birmingham Schools validation report" +[gettingstarted-github]: getting_started_github.html "Validating data on GitHub" +[gettingstarted-s3]: getting_started_s3.html "Validating data on Amazon S3"