feat(glue): add L2 resources for Database and Table (#1988)

aws · Mar 14, 2019 · 3117cd3 · 3117cd3
1 parent a1df717
commit 3117cd3
Show file tree

Hide file tree

Showing 14 changed files with 76,173 additions and 11 deletions.
diff --git a/packages/@aws-cdk/aws-glue/README.md b/packages/@aws-cdk/aws-glue/README.md
@@ -1,2 +1,187 @@
 ## The CDK Construct Library for AWS Glue
 This module is part of the [AWS Cloud Development Kit](https://github.com/awslabs/aws-cdk) project.
+
+### Database
+
+A `Database` is a logical grouping of `Tables` in the Glue Catalog.
+
+```ts
+new glue.Database(stack, 'MyDatabase', {
+  databaseName: 'my_database'
+});
+```
+
+By default, an S3 bucket is created and the Database is stored under `s3://<bucket-name>/`, but you can manually specify another location:
+
+```ts
+new glue.Database(stack, 'MyDatabase', {
+  databaseName: 'my_database',
+  locationUri: 's3://explicit-bucket/some-path/'
+});
+```
+
+### Table
+
+A Glue table describes a table of data in S3: its structure (column names and types), location of data (S3 objects with a common prefix in an S3 bucket), and format for the files (Json, Avro, Parquet, etc.):
+
+```ts
+new glue.Table(stack, 'MyTable', {
+  database: myDatabase,
+  tableName: 'my_table',
+  columns: [{
+    name: 'col1',
+    type: glue.Schema.string,
+  }, {
+    name: 'col2',
+    type: glue.Schema.array(glue.Schema.string),
+    comment: 'col2 is an array of strings' // comment is optional
+  }],
+  dataFormat: glue.DataFormat.Json
+});
+```
+
+By default, an S3 bucket will be created to store the table's data, but you can manually pass the `bucket` and `s3Prefix`:
+
+```ts
+new glue.Table(stack, 'MyTable', {
+  bucket: myBucket,
+  s3Prefix: 'my-table/'
+  ...
+});
+```
+
+#### Partitions
+
+To improve query performance, a table can specify `partitionKeys` on which data is stored and queried separately. For example, you might partition a table by `year` and `month` to optimize queries based on a time window:
+
+```ts
+new glue.Table(stack, 'MyTable', {
+  database: myDatabase,
+  tableName: 'my_table',
+  columns: [{
+    name: 'col1',
+    type: glue.Schema.string
+  }],
+  partitionKeys: [{
+    name: 'year',
+    type: glue.Schema.smallint
+  }, {
+    name: 'month',
+    type: glue.Schema.smallint
+  }],
+  dataFormat: glue.DataFormat.Json
+});
+```
+
+### [Encryption](https://docs.aws.amazon.com/athena/latest/ug/encryption.html)
+
+You can enable encryption on a Table's data:
+* `Unencrypted` - files are not encrypted. The default encryption setting.
+* [S3Managed](https://docs.aws.amazon.com/AmazonS3/latest/dev/UsingServerSideEncryption.html) - Server side encryption (`SSE-S3`) with an Amazon S3-managed key.
+```ts
+new glue.Table(stack, 'MyTable', {
+  encryption: glue.TableEncryption.S3Managed
+  ...
+});
+```
+* [Kms](https://docs.aws.amazon.com/AmazonS3/latest/dev/UsingKMSEncryption.html) - Server-side encryption (`SSE-KMS`) with an AWS KMS Key managed by the account owner.
+
+```ts
+// KMS key is created automatically
+new glue.Table(stack, 'MyTable', {
+  encryption: glue.TableEncryption.Kms
+  ...
+});
+
+// with an explicit KMS key
+new glue.Table(stack, 'MyTable', {
+  encryption: glue.TableEncryption.Kms,
+  encryptionKey: new kms.EncryptionKey(stack, 'MyKey')
+  ...
+});
+```
+* [KmsManaged](https://docs.aws.amazon.com/AmazonS3/latest/dev/UsingKMSEncryption.html) - Server-side encryption (`SSE-KMS`), like `Kms`, except with an AWS KMS Key managed by the AWS Key Management Service.
+```ts
+new glue.Table(stack, 'MyTable', {
+  encryption: glue.TableEncryption.KmsManaged
+  ...
+});
+```
+* [ClientSideKms](https://docs.aws.amazon.com/AmazonS3/latest/dev/UsingClientSideEncryption.html#client-side-encryption-kms-managed-master-key-intro) - Client-side encryption (`CSE-KMS`) with an AWS KMS Key managed by the account owner.
+```ts
+// KMS key is created automatically
+new glue.Table(stack, 'MyTable', {
+  encryption: glue.TableEncryption.ClientSideKms
+  ...
+});
+
+// with an explicit KMS key
+new glue.Table(stack, 'MyTable', {
+  encryption: glue.TableEncryption.ClientSideKms,
+  encryptionKey: new kms.EncryptionKey(stack, 'MyKey')
+  ...
+});
+```
+
+*Note: you cannot provide a `Bucket` when creating the `Table` if you wish to use server-side encryption (`Kms`, `KmsManaged` or `S3Managed`)*.
+
+### Types
+
+A table's schema is a collection of columns, each of which has a `name` and a `type`. Types are recursive structures, consisting of primitive and complex types:
+
+```ts
+new glue.Table(stack, 'MyTable', {
+  columns: [{
+    name: 'primitive_column',
+    type: glue.Schema.string
+  }, {
+    name: 'array_column',
+    type: glue.Schema.array(glue.Schema.integer),
+    comment: 'array<integer>'
+  }, {
+    name: 'map_column',
+    type: glue.Schema.map(
+      glue.Schema.string,
+      glue.Schema.timestamp),
+    comment: 'map<string,timestamp>'
+  }, {
+    name: 'struct_column',
+    type: glue.Schema.struct([{
+      name: 'nested_column',
+      type: glue.Schema.date,
+      comment: 'nested comment'
+    }]),
+    comment: "struct<nested_column:date COMMENT 'nested comment'>"
+  }],
+  ...
+```
+
+#### Primitive
+
+Numeric:
+* `bigint`
+* `float`
+* `integer`
+* `smallint`
+* `tinyint`
+
+Date and Time:
+* `date`
+* `timestamp`
+
+String Types:
+
+* `string`
+* `decimal`
+* `char`
+* `varchar`
+
+Misc:
+* `boolean`
+* `binary`
+
+#### Complex
+
+* `array` - array of some other type
+* `map` - map of some primitive key type to any value type.
+* `struct` - nested structure containing individually named and typed columns.
diff --git a/packages/@aws-cdk/aws-glue/lib/data-format.ts b/packages/@aws-cdk/aws-glue/lib/data-format.ts
@@ -0,0 +1,83 @@
+/**
+ * Absolute class name of the Hadoop `InputFormat` to use when reading table files.
+ *
+ * Each instance wraps a single class name, exposed through the public read-only
+ * `className` property. `TextInputFormat` is the predefined instance declared here;
+ * the public constructor accepts any other class name as a plain string.
+ */
+export class InputFormat {
+  /**
+   * An InputFormat for plain text files. Files are broken into lines. Either linefeed or
+   * carriage-return are used to signal end of line. Keys are the position in the file, and
+   * values are the line of text.
+   *
+   * Class name: `org.apache.hadoop.mapred.TextInputFormat`.
+   *
+   * @see https://hadoop.apache.org/docs/stable/api/org/apache/hadoop/mapred/TextInputFormat.html
+   */
+  public static readonly TextInputFormat = new InputFormat('org.apache.hadoop.mapred.TextInputFormat');
+
+  constructor(public readonly className: string) {}
+}
+
+/**
+ * Absolute class name of the Hadoop `OutputFormat` to use when writing table files.
+ *
+ * Each instance wraps a single class name, exposed through the public read-only
+ * `className` property. `HiveIgnoreKeyTextOutputFormat` is the predefined instance
+ * declared here; the public constructor accepts any other class name as a plain string.
+ */
+export class OutputFormat {
+  /**
+   * Writes text data with a null key (value only).
+   *
+   * Class name: `org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat`.
+   *
+   * @see https://hive.apache.org/javadocs/r2.2.0/api/org/apache/hadoop/hive/ql/io/HiveIgnoreKeyTextOutputFormat.html
+   */
+  public static readonly HiveIgnoreKeyTextOutputFormat = new OutputFormat('org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat');
+
+  constructor(public readonly className: string) {}
+}
+
+/**
+ * Serialization library to use when serializing/deserializing (SerDe) table records.
+ *
+ * Each instance wraps a single SerDe class name, exposed through the public read-only
+ * `className` property. Two JSON SerDes are predefined below; the public constructor
+ * accepts any other class name as a plain string.
+ *
+ * @see https://cwiki.apache.org/confluence/display/Hive/SerDe
+ */
+export class SerializationLibrary {
+  /**
+   * JSON SerDe from the Hive HCatalog package (`org.apache.hive.hcatalog.data.JsonSerDe`).
+   *
+   * @see https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-JSON
+   */
+  public static readonly HiveJson = new SerializationLibrary('org.apache.hive.hcatalog.data.JsonSerDe');
+
+  /**
+   * OpenX JSON SerDe (`org.openx.data.jsonserde.JsonSerDe`).
+   *
+   * @see https://github.com/rcongiu/Hive-JSON-Serde
+   */
+  public static readonly OpenXJson = new SerializationLibrary('org.openx.data.jsonserde.JsonSerDe');
+
+  constructor(public readonly className: string) {}
+}
+
+/**
+ * Defines the input/output formats and ser/de for a single DataFormat.
+ *
+ * A data format is the combination of three class-name wrappers: how files are read
+ * (`inputFormat`), how they are written (`outputFormat`), and how individual records
+ * are serialized and deserialized (`serializationLibrary`).
+ */
+export interface DataFormat {
+  /**
+   * `InputFormat` used when reading files of this data format.
+   */
+  inputFormat: InputFormat;
+
+  /**
+   * `OutputFormat` used when writing files of this data format.
+   */
+  outputFormat: OutputFormat;
+
+  /**
+   * Serialization library (SerDe) used for the records of this data format.
+   */
+  serializationLibrary: SerializationLibrary;
+}
+
+export namespace DataFormat {
+  /**
+   * Stored as plain text files in JSON format.
+   *
+   * Uses OpenX Json SerDe for serialization and deserialization, together with
+   * `InputFormat.TextInputFormat` for reading and
+   * `OutputFormat.HiveIgnoreKeyTextOutputFormat` for writing.
+   *
+   * This namespace shares its name with the `DataFormat` interface, so the constant
+   * is referenced as `DataFormat.Json`.
+   *
+   * @see https://docs.aws.amazon.com/athena/latest/ug/json.html
+   */
+  export const Json: DataFormat = {
+    inputFormat: InputFormat.TextInputFormat,
+    outputFormat: OutputFormat.HiveIgnoreKeyTextOutputFormat,
+    serializationLibrary: SerializationLibrary.OpenXJson
+  };
+}