-
Notifications
You must be signed in to change notification settings - Fork 4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Sam Goodwin
authored
Mar 14, 2019
1 parent
a1df717
commit 3117cd3
Showing
14 changed files
with
76,173 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,187 @@ | ||
## The CDK Construct Library for AWS Glue | ||
This module is part of the [AWS Cloud Development Kit](https://github.com/awslabs/aws-cdk) project. | ||
|
||
### Database | ||
|
||
A `Database` is a logical grouping of `Tables` in the Glue Catalog. | ||
|
||
```ts | ||
new glue.Database(stack, 'MyDatabase', { | ||
databaseName: 'my_database' | ||
}); | ||
``` | ||
|
||
By default, an S3 bucket is created and the Database is stored under `s3://<bucket-name>/`, but you can manually specify another location:
|
||
```ts | ||
new glue.Database(stack, 'MyDatabase', { | ||
databaseName: 'my_database', | ||
locationUri: 's3://explicit-bucket/some-path/' | ||
}); | ||
``` | ||
|
||
### Table | ||
|
||
A Glue table describes a table of data in S3: its structure (column names and types), the location of the data (S3 objects with a common prefix in an S3 bucket), and the format of the files (JSON, Avro, Parquet, etc.):
|
||
```ts
new glue.Table(stack, 'MyTable', {
  database: myDatabase,
  tableName: 'my_table',
  columns: [{
    name: 'col1',
    type: glue.Schema.string,
  }, {
    name: 'col2',
    type: glue.Schema.array(glue.Schema.string),
    comment: 'col2 is an array of strings' // comment is optional
  }],
  dataFormat: glue.DataFormat.Json
});
```
|
||
By default, an S3 bucket will be created to store the table's data, but you can manually pass the `bucket` and `s3Prefix`:
|
||
```ts | ||
new glue.Table(stack, 'MyTable', { | ||
bucket: myBucket, | ||
s3Prefix: 'my-table/' | ||
... | ||
}); | ||
``` | ||
|
||
#### Partitions | ||
|
||
To improve query performance, a table can specify `partitionKeys` on which data is stored and queried separately. For example, you might partition a table by `year` and `month` to optimize queries based on a time window: | ||
|
||
```ts | ||
new glue.Table(stack, 'MyTable', { | ||
database: myDatabase, | ||
tableName: 'my_table', | ||
columns: [{ | ||
name: 'col1', | ||
type: glue.Schema.string | ||
}], | ||
partitionKeys: [{ | ||
name: 'year', | ||
type: glue.Schema.smallint | ||
}, { | ||
name: 'month', | ||
type: glue.Schema.smallint | ||
}], | ||
dataFormat: glue.DataFormat.Json | ||
}); | ||
``` | ||
|
||
### [Encryption](https://docs.aws.amazon.com/athena/latest/ug/encryption.html) | ||
|
||
You can enable encryption on a Table's data: | ||
* `Unencrypted` - files are not encrypted. The default encryption setting. | ||
* [S3Managed](https://docs.aws.amazon.com/AmazonS3/latest/dev/UsingServerSideEncryption.html) - Server side encryption (`SSE-S3`) with an Amazon S3-managed key. | ||
```ts | ||
new glue.Table(stack, 'MyTable', { | ||
encryption: glue.TableEncryption.S3Managed | ||
... | ||
}); | ||
``` | ||
* [Kms](https://docs.aws.amazon.com/AmazonS3/latest/dev/UsingKMSEncryption.html) - Server-side encryption (`SSE-KMS`) with an AWS KMS Key managed by the account owner. | ||
|
||
```ts | ||
// KMS key is created automatically | ||
new glue.Table(stack, 'MyTable', { | ||
encryption: glue.TableEncryption.Kms | ||
... | ||
}); | ||
|
||
// with an explicit KMS key | ||
new glue.Table(stack, 'MyTable', { | ||
encryption: glue.TableEncryption.Kms, | ||
encryptionKey: new kms.EncryptionKey(stack, 'MyKey') | ||
... | ||
}); | ||
``` | ||
* [KmsManaged](https://docs.aws.amazon.com/AmazonS3/latest/dev/UsingKMSEncryption.html) - Server-side encryption (`SSE-KMS`), like `Kms`, except with an AWS KMS Key managed by the AWS Key Management Service. | ||
```ts | ||
new glue.Table(stack, 'MyTable', { | ||
encryption: glue.TableEncryption.KmsManaged | ||
... | ||
}); | ||
``` | ||
* [ClientSideKms](https://docs.aws.amazon.com/AmazonS3/latest/dev/UsingClientSideEncryption.html#client-side-encryption-kms-managed-master-key-intro) - Client-side encryption (`CSE-KMS`) with an AWS KMS Key managed by the account owner. | ||
```ts | ||
// KMS key is created automatically | ||
new glue.Table(stack, 'MyTable', { | ||
encryption: glue.TableEncryption.ClientSideKms | ||
... | ||
}); | ||
|
||
// with an explicit KMS key | ||
new glue.Table(stack, 'MyTable', { | ||
encryption: glue.TableEncryption.ClientSideKms, | ||
encryptionKey: new kms.EncryptionKey(stack, 'MyKey') | ||
... | ||
}); | ||
``` | ||
|
||
*Note: you cannot provide a `Bucket` when creating the `Table` if you wish to use server-side encryption (`Kms`, `KmsManaged` or `S3Managed`)*. | ||
|
||
### Types | ||
|
||
A table's schema is a collection of columns, each of which has a `name` and a `type`. Types are recursive structures, consisting of primitive and complex types:
|
||
```ts | ||
new glue.Table(stack, 'MyTable', { | ||
columns: [{ | ||
name: 'primitive_column', | ||
type: glue.Schema.string | ||
}, { | ||
name: 'array_column', | ||
type: glue.Schema.array(glue.Schema.integer), | ||
comment: 'array<integer>' | ||
}, { | ||
name: 'map_column', | ||
type: glue.Schema.map( | ||
glue.Schema.string, | ||
glue.Schema.timestamp), | ||
comment: 'map<string,timestamp>'
}, { | ||
name: 'struct_column', | ||
type: glue.Schema.struct([{ | ||
name: 'nested_column', | ||
type: glue.Schema.date, | ||
comment: 'nested comment' | ||
}]), | ||
comment: "struct<nested_column:date COMMENT 'nested comment'>" | ||
}], | ||
... | ||
``` | ||
#### Primitive | ||
Numeric: | ||
* `bigint` | ||
* `float` | ||
* `integer` | ||
* `smallint` | ||
* `tinyint` | ||
Date and Time: | ||
* `date` | ||
* `timestamp` | ||
String Types: | ||
* `string` | ||
* `decimal` | ||
* `char` | ||
* `varchar` | ||
Misc: | ||
* `boolean` | ||
* `binary` | ||
#### Complex | ||
* `array` - array of some other type | ||
* `map` - map of some primitive key type to any value type. | ||
* `struct` - nested structure containing individually named and typed columns. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
/**
 * Absolute class name of the Hadoop `InputFormat` to use when reading table files.
 */
export class InputFormat {
  /**
   * An InputFormat for plain text files. Files are broken into lines; either a
   * linefeed or a carriage return signals the end of a line. Keys are the position
   * in the file, and values are the line of text.
   *
   * @see https://hadoop.apache.org/docs/stable/api/org/apache/hadoop/mapred/TextInputFormat.html
   */
  public static readonly TextInputFormat = new InputFormat('org.apache.hadoop.mapred.TextInputFormat');

  /**
   * @param className absolute class name of the Hadoop `InputFormat`; stored as-is,
   * no validation is performed here.
   */
  constructor(public readonly className: string) {}
}
|
||
/**
 * Absolute class name of the Hadoop `OutputFormat` to use when writing table files.
 */
export class OutputFormat {
  /**
   * Writes text data with a null key (value only).
   *
   * @see https://hive.apache.org/javadocs/r2.2.0/api/org/apache/hadoop/hive/ql/io/HiveIgnoreKeyTextOutputFormat.html
   */
  public static readonly HiveIgnoreKeyTextOutputFormat = new OutputFormat('org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat');

  /**
   * @param className absolute class name of the Hadoop `OutputFormat`; stored as-is,
   * no validation is performed here.
   */
  constructor(public readonly className: string) {}
}
|
||
/**
 * Serialization library to use when serializing/deserializing (SerDe) table records.
 *
 * @see https://cwiki.apache.org/confluence/display/Hive/SerDe
 */
export class SerializationLibrary {
  /**
   * The Hive (HCatalog) JSON SerDe.
   *
   * @see https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-JSON
   */
  public static readonly HiveJson = new SerializationLibrary('org.apache.hive.hcatalog.data.JsonSerDe');

  /**
   * The OpenX JSON SerDe.
   *
   * @see https://github.com/rcongiu/Hive-JSON-Serde
   */
  public static readonly OpenXJson = new SerializationLibrary('org.openx.data.jsonserde.JsonSerDe');

  /**
   * @param className class name of the serialization library; stored as-is,
   * no validation is performed here.
   */
  constructor(public readonly className: string) {}
}
|
||
/**
 * Defines the input/output formats and ser/de for a single DataFormat.
 */
export interface DataFormat {
  /**
   * `InputFormat` for this data format.
   */
  inputFormat: InputFormat;

  /**
   * `OutputFormat` for this data format.
   */
  outputFormat: OutputFormat;

  /**
   * Serialization library for this data format.
   */
  serializationLibrary: SerializationLibrary;
}
|
||
/**
 * Predefined `DataFormat` values.
 *
 * Declared as a namespace so that it merges with the `DataFormat` interface and
 * the presets are reachable as `DataFormat.<Name>`.
 */
export namespace DataFormat {
  /**
   * Stored as plain text files in JSON format.
   *
   * Uses OpenX Json SerDe for serialization and deserialization.
   *
   * @see https://docs.aws.amazon.com/athena/latest/ug/json.html
   */
  export const Json: DataFormat = {
    inputFormat: InputFormat.TextInputFormat,
    outputFormat: OutputFormat.HiveIgnoreKeyTextOutputFormat,
    serializationLibrary: SerializationLibrary.OpenXJson
  };
}
Oops, something went wrong.