Skip to content

Commit

Permalink
change to add resource share
Browse files Browse the repository at this point in the history
  • Loading branch information
randyridgley committed May 10, 2022
1 parent 52207a7 commit 97ba939
Show file tree
Hide file tree
Showing 8 changed files with 907 additions and 125 deletions.
33 changes: 15 additions & 18 deletions API.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,24 +34,6 @@ new DataLake(scope: Construct, id: string, props: DataLakeProperties)

#### Methods <a name="Methods"></a>

##### `createCrossAccountGlueCatalogResourcePolicy` <a name="@randyridgley/cdk-datalake-constructs.DataLake.createCrossAccountGlueCatalogResourcePolicy"></a>

```typescript
public createCrossAccountGlueCatalogResourcePolicy(consumerAccountIds: string[], dataCatalogOwnerAccountId: string)
```

###### `consumerAccountIds`<sup>Required</sup> <a name="@randyridgley/cdk-datalake-constructs.DataLake.parameter.consumerAccountIds"></a>

- *Type:* `string`[]

---

###### `dataCatalogOwnerAccountId`<sup>Required</sup> <a name="@randyridgley/cdk-datalake-constructs.DataLake.parameter.dataCatalogOwnerAccountId"></a>

- *Type:* `string`

---

##### `createDownloaderCustomResource` <a name="@randyridgley/cdk-datalake-constructs.DataLake.createDownloaderCustomResource"></a>

```typescript
Expand Down Expand Up @@ -1816,6 +1798,19 @@ Create default Athena workgroup for querying data lake resources.

---

##### `createDefaultDatabse`<sup>Optional</sup> <a name="@randyridgley/cdk-datalake-constructs.DataLakeProperties.property.createDefaultDatabse"></a>

```typescript
public readonly createDefaultDatabse: boolean;
```

- *Type:* `boolean`
- *Default:* false

Create default glue database for the data lake.

---

##### `crossAccountAccess`<sup>Optional</sup> <a name="@randyridgley/cdk-datalake-constructs.DataLakeProperties.property.crossAccountAccess"></a>

```typescript
Expand Down Expand Up @@ -3111,6 +3106,8 @@ public readonly tableName: string;

### DataProduct <a name="@randyridgley/cdk-datalake-constructs.DataProduct"></a>

- *Implements:* [`constructs.IDependable`](#constructs.IDependable)

#### Initializers <a name="@randyridgley/cdk-datalake-constructs.DataProduct.Initializer"></a>

```typescript
Expand Down
1 change: 1 addition & 0 deletions src/data-lake-bucket.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ export class DataLakeBucket extends Construct {
});

if (props.crossAccount) {
// TODO: revisit this bucket policy for cross account access.
this.bucket.addToResourcePolicy(
new iam.PolicyStatement({
resources: [
Expand Down
144 changes: 69 additions & 75 deletions src/data-lake-strategy.ts
Original file line number Diff line number Diff line change
@@ -1,25 +1,25 @@
import { Connection, ConnectionType, Database } from '@aws-cdk/aws-glue-alpha';
import { NestedStack, Stack } from 'aws-cdk-lib';
import { Aws, NestedStack, Stack } from 'aws-cdk-lib';
import { SecurityGroup, Vpc } from 'aws-cdk-lib/aws-ec2';
import { Rule } from 'aws-cdk-lib/aws-events';
import { LambdaFunction } from 'aws-cdk-lib/aws-events-targets';
import { CfnPermissions, CfnResource } from 'aws-cdk-lib/aws-lakeformation';
import { Function } from 'aws-cdk-lib/aws-lambda';
import { CfnResourceShare } from 'aws-cdk-lib/aws-ram';
import { Bucket } from 'aws-cdk-lib/aws-s3';
import { IDependable } from 'constructs';
import { DataLakeBucket } from './data-lake-bucket';
import { DataProduct } from './data-product';
import { KinesisOps } from './data-streams/kinesis-ops';
import { KinesisStream } from './data-streams/kinesis-stream';
import { CompressionType, S3DeliveryStream } from './data-streams/s3-delivery-stream';
import { GlueCrawler } from './etl/glue-crawler';
import { GlueJob } from './etl/glue-job';
import { GlueJobOps } from './etl/glue-job-ops';
import { GlueTable } from './etl/glue-table';
import { DataPipelineType, DataTier, LakeKind, Permissions, Stage } from './global/enums';
import { DataSetResult } from './global/interfaces';
import { Pipeline } from './pipeline';
import { buildGlueCrawlerName, buildRoleName, buildS3BucketName, packageAsset, toS3Path } from './utils';
import { buildS3BucketName, packageAsset, toS3Path } from './utils';

export interface DataStrategyProps {
readonly stack: Stack;
Expand Down Expand Up @@ -52,83 +52,54 @@ export abstract class LakeImplStrategy {
/**
 * Resolve the S3 bucket name backing a given data tier for a pipeline.
 *
 * Looks up the pipeline's entry in `downloadLocations` and returns the
 * bucket for the requested tier. Unknown tiers fall back to the RAW
 * bucket so callers always receive a usable bucket name.
 *
 * NOTE(review): the declared return type keeps `| undefined` for
 * backward compatibility, although the fallback means the visible
 * branches always return a string.
 */
getDataSetBucketName(pipe: Pipeline, dataTier: DataTier) : string | undefined {
  const locations = this.downloadLocations[pipe.name];
  switch (dataTier) {
    case DataTier.RAW:
      return locations.rawBucketName;
    case DataTier.REFINED:
      return locations.refinedBucketName;
    case DataTier.TRUSTED:
      return locations.trustedBucketName;
    default:
      // Fallback mirrors the RAW tier rather than returning undefined.
      return locations.rawBucketName;
  }
}

createDataProduct(props: DataStrategyProps): void {
const pipelineStack = new NestedStack(props.stack, `${props.pipe.name}-dataset-stack`); // props.product.accountId == Aws.ACCOUNT_ID ? new NestedStack(props.stack, `${props.pipe.name}-dataset-stack`) : props.stack;
// create a nested stack per data product to allow for independent updates
const pipelineStack = new NestedStack(props.stack, `${props.pipe.name}-dataset-stack`);
this.logBucket = props.logBucket;
this.stageName = props.stage;
this.securityGroup = props.securityGroup;
this.vpc = props.vpc;
this.datalakeAdminRoleArn = props.datalakeAdminRoleArn;
this.datalakeDbCreatorRoleArn = props.datalakeDbCreatorRoleArn;

// if data to download into a tier create the download locations
if (props.pipe.dataSetDropTier) {
this.downloadLocations[props.pipe.name] = {
destinationPrefix: props.pipe.destinationPrefix,
sourceBucketName: props.pipe.s3Properties? props.pipe.s3Properties.sourceBucketName! : undefined,
sourceKeys: props.pipe.s3Properties ? props.pipe.s3Properties.sourceKeys! : undefined,
rawBucketName: buildS3BucketName({
name: props.pipe.name,
accountId: props.product.accountId,
resourceUse: 'raw',
stage: this.stageName,
}),
refinedBucketName: buildS3BucketName({
name: props.pipe.name,
accountId: props.product.accountId,
resourceUse: 'refined',
stage: this.stageName,
}),
trustedBucketName: buildS3BucketName({
name: props.pipe.name,
accountId: props.product.accountId,
resourceUse: 'trusted',
stage: this.stageName,
}),
destinationBucketName: buildS3BucketName({
name: props.pipe.name,
accountId: props.product.accountId,
resourceUse: props.pipe.dataSetDropTier == DataTier.RAW ? 'raw' : props.pipe.dataSetDropTier == DataTier.REFINED ? 'refined' : 'trusted',
stage: this.stageName,
}),
};
}
this.createBuckets(pipelineStack, props.pipe, props.product, props.database);

const bucketName = this.getDataSetBucketName(props.pipe, props.pipe.dataSetDropTier)!;
this.addPipeline(pipelineStack, props.pipe, props.product, bucketName);
}

protected createCrawler(stack: Stack, pipe: Pipeline, product: DataProduct,
bucketName: string, s3DataLFResource: CfnResource, database: Database): void {
if (pipe.table !== undefined) return;

const name = bucketName.replace(/\W/g, '');
// only create a crawler for the drop location of the data in the data product of the pipeline
const crawler = new GlueCrawler(stack, `data-lake-crawler-${name}`, {
name: buildGlueCrawlerName({
// create list of data drop locations to use later in the custom resource to download the data
this.downloadLocations[props.pipe.name] = {
destinationPrefix: props.pipe.destinationPrefix,
sourceBucketName: props.pipe.s3Properties? props.pipe.s3Properties.sourceBucketName! : undefined,
sourceKeys: props.pipe.s3Properties ? props.pipe.s3Properties.sourceKeys! : undefined,
rawBucketName: buildS3BucketName({
name: props.pipe.name,
accountId: props.product.accountId,
resourceUse: 'raw',
stage: this.stageName,
resourceUse: 'crawler',
name: name,
}),
databaseName: product.databaseName,
bucketName: bucketName,
bucketPrefix: pipe.destinationPrefix,
roleName: buildRoleName({
refinedBucketName: buildS3BucketName({
name: props.pipe.name,
accountId: props.product.accountId,
resourceUse: 'refined',
stage: this.stageName,
resourceUse: 'crawler-role',
name: name,
}),
lfS3Resource: s3DataLFResource,
});
crawler.node.addDependency(database);
trustedBucketName: buildS3BucketName({
name: props.pipe.name,
accountId: props.product.accountId,
resourceUse: 'trusted',
stage: this.stageName,
}),
destinationBucketName: buildS3BucketName({
name: props.pipe.name,
accountId: props.product.accountId,
resourceUse: props.pipe.dataSetDropTier == DataTier.RAW ? 'raw' : props.pipe.dataSetDropTier == DataTier.REFINED ? 'refined' : 'trusted',
stage: this.stageName,
}),
};

this.locationRegistry.forEach(r => {
crawler.node.addDependency(r);
});
this.createTierBucketsAndPermissions(pipelineStack, props.pipe, props.product);

const dataDropBucketName = this.getDataSetBucketName(props.pipe, props.pipe.dataSetDropTier)!;
this.addPipeline(pipelineStack, props.pipe, props.product, dataDropBucketName);
}

protected createGlueTable(stack: Stack, pipeline: Pipeline, product: DataProduct, bucketName: string): void {
Expand Down Expand Up @@ -317,13 +288,14 @@ export abstract class LakeImplStrategy {
}
}

createBuckets(stack: Stack, pipe: Pipeline, product: DataProduct, database: Database): void {
private createTierBucketsAndPermissions(stack: Stack, pipe: Pipeline, product: DataProduct): void {
/// This is confusing. Find a way to simplify
const dataCatalogAccountId = product.dataCatalogAccountId ?
product.dataCatalogAccountId : product.accountId;
const crossAccount = product.dataCatalogAccountId ?
product.dataCatalogAccountId != product.accountId ? true : false : false;

// for each data tier create the appropriate buckets
pipe.tiers.forEach(r => {
const bucketName = this.getDataSetBucketName(pipe, r)!;

Expand All @@ -334,7 +306,7 @@ export abstract class LakeImplStrategy {
logBucket: this.logBucket!,
crossAccount: crossAccount,
s3Properties: product.s3BucketProps,
}).bucket;
});
}

if (this.lakeKind() === LakeKind.CENTRAL_CATALOG || this.lakeKind() === LakeKind.DATA_PRODUCT_AND_CATALOG) {
Expand All @@ -350,10 +322,8 @@ export abstract class LakeImplStrategy {
}

if (product.dataCatalogAccountId != product.accountId) {
this.createDataLocationCrossAccountOwner(stack, `${name}-ca-owner`, product.accountId, product.dataCatalogAccountId!, bucketName, lfResource);
this.createDataLocationCrossAccountOwner(stack, `${name}-xa-owner`, product.accountId, product.dataCatalogAccountId!, bucketName, lfResource);
}

this.createCrawler(stack, pipe, product, bucketName, lfResource, database);
}
});
}
Expand Down Expand Up @@ -416,6 +386,31 @@ class DataProductStrategy extends LakeImplStrategy {
}

addPipeline(stack: Stack, pipeline: Pipeline, dataProduct: DataProduct, bucketName: string): void {
  // Register an explicit Glue table only when the pipeline declares one.
  if (pipeline.table) {
    this.createGlueTable(stack, pipeline, dataProduct, bucketName);
  }

  // When the Glue Data Catalog lives in a different account, share this
  // product's catalog, database, and tables with that account via AWS RAM.
  // NOTE(review): Aws.ACCOUNT_ID is a deploy-time CloudFormation token, so
  // this synth-time string comparison is effectively always true — confirm
  // whether the guard should compare against a concrete account id instead.
  if (dataProduct.dataCatalogAccountId && dataProduct.dataCatalogAccountId != Aws.ACCOUNT_ID) {
    new CfnResourceShare(stack, `${pipeline.name}-resource-share`, {
      name: `LakeFormation-${pipeline.name}-${dataProduct.dataCatalogAccountId}`,
      // Only accounts, not arbitrary external principals, may use the share.
      allowExternalPrincipals: false,
      permissionArns: [
        'arn:aws:ram::aws:permission/AWSRAMPermissionGlueDatabaseReadWrite',
        'arn:aws:ram::aws:permission/AWSRAMPermissionGlueDatabaseReadWriteForCatalog',
        'arn:aws:ram::aws:permission/AWSRAMPermissionGlueDatabaseReadWriteForTable',
      ],
      principals: [
        dataProduct.dataCatalogAccountId,
      ],
      resourceArns: [
        // Use partition/region pseudo-parameters instead of hard-coding
        // "us-east-1" so the share works in whatever region/partition the
        // stack deploys to.
        `arn:${Aws.PARTITION}:glue:${Aws.REGION}:${Aws.ACCOUNT_ID}:catalog`,
        `arn:${Aws.PARTITION}:glue:${Aws.REGION}:${Aws.ACCOUNT_ID}:database/${dataProduct.databaseName}`,
        `arn:${Aws.PARTITION}:glue:${Aws.REGION}:${Aws.ACCOUNT_ID}:table/${dataProduct.databaseName}/*`,
      ],
    });
  }
  this.createPipelineResources(stack, pipeline, dataProduct, bucketName);
}
}
Expand All @@ -425,10 +420,10 @@ class CentralCatalogStrategy extends LakeImplStrategy {
return LakeKind.CENTRAL_CATALOG;
}

// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore
addPipeline(stack: Stack, pipeline: Pipeline, dataProduct: DataProduct, bucketName: string): void {
if (pipeline.table) {
this.createGlueTable(stack, pipeline, dataProduct, bucketName);
}
//
}
}

Expand All @@ -450,11 +445,10 @@ class DataProductAndCatalogStrategy extends LakeImplStrategy {
}

addPipeline(stack: Stack, pipeline: Pipeline, dataProduct: DataProduct, bucketName: string): void {
this.createPipelineResources(stack, pipeline, dataProduct, bucketName);

if (pipeline.table) {
this.createGlueTable(stack, pipeline, dataProduct, bucketName);
}
this.createPipelineResources(stack, pipeline, dataProduct, bucketName);
}
}

Expand Down
19 changes: 16 additions & 3 deletions src/data-lake.ts
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,14 @@ export interface DataLakeProperties {
*
* @default - false
*/
readonly createAthenaWorkgroup?: Boolean;
readonly createAthenaWorkgroup?: boolean;
/**
* Create a default Glue database for the data lake
*
* @default false
*/
readonly createDefaultDatabse?: boolean;

}

export interface DataTierBucketProps {
Expand Down Expand Up @@ -159,7 +166,7 @@ export class DataLake extends Construct {
this.logBucketProps = {
lifecycleRules: [
{
expiration: Duration.days(30),
expiration: Duration.days(7),
},
],
removalPolicy: RemovalPolicy.DESTROY,
Expand Down Expand Up @@ -254,10 +261,16 @@ export class DataLake extends Construct {
new CfnOutput(this, 'DataLakeAthenaWorkgroup', { value: this.athenaWorkgroup.name });
}

// if there are custom tags passed into the data lake create them here with a custom resource
// TODO: once Tags are included as part of CFN remove the custom resource.
if (props.policyTags) {
this.createPolicyTagsCustomResource(props.policyTags);
}

if (props.createDefaultDatabse) {
this.createDatabase(`${props.name}-${props.stageName}`);
}

this.dataLakeStrategy = LakeStrategyFactory.getLakeStrategy(props.lakeKind);

if (props.dataProducts && props.dataProducts.length > 0) {
Expand Down Expand Up @@ -374,7 +387,7 @@ export class DataLake extends Construct {
outputs.node.addDependency(this.datalakeAdminRole);
}

public createCrossAccountGlueCatalogResourcePolicy(consumerAccountIds: string[], dataCatalogOwnerAccountId: string) {
protected createCrossAccountGlueCatalogResourcePolicy(consumerAccountIds: string[], dataCatalogOwnerAccountId: string) {
const onCatalogEvent = new PythonFunction(this, 'enable-hybrid-catalog-handler', {
runtime: lambda.Runtime.PYTHON_3_7,
entry: path.join(__dirname, '../lambda/enable-hybrid-catalog'),
Expand Down
3 changes: 2 additions & 1 deletion src/data-product.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { Duration, RemovalPolicy } from 'aws-cdk-lib';
import * as s3 from 'aws-cdk-lib/aws-s3';
import { IDependable } from 'constructs';
import { Pipeline } from './pipeline';

export interface DataProductProperties {
Expand All @@ -10,7 +11,7 @@ export interface DataProductProperties {
readonly s3BucketProps?: s3.BucketProps;
}

export class DataProduct {
export class DataProduct implements IDependable {
readonly accountId: string;
readonly dataCatalogAccountId?: string;
readonly databaseName: string;
Expand Down
Loading

0 comments on commit 97ba939

Please sign in to comment.