diff --git a/CloudFormationDirectDataTemplateLatest.yaml b/CloudFormationDirectDataTemplateLatest.yaml index 12458ad..4e1c397 100644 --- a/CloudFormationDirectDataTemplateLatest.yaml +++ b/CloudFormationDirectDataTemplateLatest.yaml @@ -1,454 +1,423 @@ { - "AWSTemplateFormatVersion": "2010-09-09", - "Metadata": { - "AWS::CloudFormation::Designer": { - "9ca5f848-8195-4390-9090-b0ce8302d4d3": { - "size": { - "width": 140, - "height": 140 - }, - "position": { - "x": 294, - "y": 169 - }, - "z": 0, - "embeds": [] - }, - "490f8729-01cc-4bc0-b9c1-c7d4fdcc4a3c": { - "size": { - "width": 140, - "height": 140 - }, - "position": { - "x": 150, - "y": 170 - }, - "z": 0, - "embeds": [] - }, - "cc139752-432c-4a72-a9d3-8c2947da4c3e": { - "size": { - "width": 60, - "height": 60 - }, - "position": { - "x": 450, - "y": 210 - }, - "z": 0, - "embeds": [] - }, - "7fe53bfb-ccf0-4a80-b626-ef1b97de3af8": { - "size": { - "width": 60, - "height": 60 - }, - "position": { - "x": 60, - "y": 200 - }, - "z": 0, - "embeds": [] - }, - "ee43fa17-a878-4ec1-b441-cc44f82af366": { - "size": { - "width": 60, - "height": 60 - }, - "position": { - "x": 64.94355492876814, - "y": 108.01864856583974 - }, - "z": 0 - } - } - }, - "Resources": { - "CfDirectDataInternetGateway": { - "Type": "AWS::EC2::InternetGateway", - "Properties": {} - }, - "CfDirectDataVpc": { - "Type": "AWS::EC2::VPC", - "Properties": { - "CidrBlock": "10.0.0.0/16", - "EnableDnsSupport": "true", - "Tags": [ - { - "Key": "Name", - "Value": "cf-direct-data" - } - ] - }, - "Metadata": { - "AWS::CloudFormation::Designer": { - "id": "9ca5f848-8195-4390-9090-b0ce8302d4d3" - } - } - }, - "InternetGatewayAttachment": { - "Type": "AWS::EC2::VPCGatewayAttachment", - "Properties": { - "VpcId": { - "Ref": "CfDirectDataVpc" - }, - "InternetGatewayId": { - "Ref": "CfDirectDataInternetGateway" - } - } - }, - "CfDirectDataVpcSubnet": { - "Type": "AWS::EC2::Subnet", - "Properties": { - "CidrBlock": "10.0.0.0/24", - "VpcId": { - "Ref": "CfDirectDataVpc" - }, - "Tags": [ - { - "Key": "Name", - "Value": "cf-direct-data" - } - ] - }, - "Metadata": { - "AWS::CloudFormation::Designer": { - "id": "490f8729-01cc-4bc0-b9c1-c7d4fdcc4a3c" - } - } - }, - "CfDirectDataSecurityGroup": { - "Type": "AWS::EC2::SecurityGroup", - "Properties": { - "GroupDescription": "Allows traffic to redshift", - "GroupName": "cf-direct-data-security-group", - "VpcId": { - "Ref": "CfDirectDataVpc" - }, - "SecurityGroupIngress": [ - { - "FromPort": 80, - "IpProtocol": "tcp", - "CidrIp": "0.0.0.0/0", - "ToPort": 80 - }, - { - "FromPort": 5439, - "IpProtocol": "tcp", - "CidrIp": "0.0.0.0/0", - "ToPort": 5439 - } - ] - } - }, - "CfDirectDataS3": { - "Type": "AWS::S3::Bucket", - "Properties": { - "BucketName": "cf-direct-data" - }, - "Metadata": { - "AWS::CloudFormation::Designer": { - "id": "cc139752-432c-4a72-a9d3-8c2947da4c3e" - } - } - }, - "CfDirectDataLambdaExecutionRole": { - "Type": "AWS::IAM::Role", - "Properties": { - "RoleName": { - "Fn::Join": [ - "-", - [ - "cf-direct-data-lambda-execution-role", - { - "Ref": "AWS::Region" - } - ] - ] - }, - "AssumeRolePolicyDocument": { - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Principal": { - "Service": "lambda.amazonaws.com" - }, - "Action": "sts:AssumeRole" - } - ] - }, - "ManagedPolicyArns": [ - "arn:aws:iam::aws:policy/AWSBatchFullAccess", - "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy", - "arn:aws:iam::aws:policy/SecretsManagerReadWrite", - "arn:aws:iam::aws:policy/AmazonS3FullAccess", - 
"arn:aws:iam::aws:policy/AWSLambda_FullAccess", - "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole", - "arn:aws:iam::aws:policy/AmazonRedshiftFullAccess", - "arn:aws:iam::aws:policy/AmazonSQSFullAccess", - "arn:aws:iam::aws:policy/service-role/AWSLambdaSQSQueueExecutionRole" - ] - }, - "Metadata": { - "AWS::CloudFormation::Designer": { - "id": "7fe53bfb-ccf0-4a80-b626-ef1b97de3af8" - } - } - }, - "CfDirectDataEcsTaskExecutionRole": { - "Type": "AWS::IAM::Role", - "Properties": { - "RoleName": { - "Fn::Join": [ - "-", - [ - "cf-direct-data-ecs-task-execution-role", - { - "Ref": "AWS::Region" - } - ] - ] - }, - "AssumeRolePolicyDocument": { - "Version": "2008-10-17", - "Statement": [ - { - "Effect": "Allow", - "Principal": { - "Service": "ecs-tasks.amazonaws.com" - }, - "Action": "sts:AssumeRole" - } - ] - }, - "ManagedPolicyArns": [ - "arn:aws:iam::aws:policy/AmazonAPIGatewayInvokeFullAccess", - "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly", - "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy", - "arn:aws:iam::aws:policy/AmazonS3FullAccess", - "arn:aws:iam::aws:policy/AWSBatchFullAccess", - "arn:aws:iam::aws:policy/AWSLambda_FullAccess", - "arn:aws:iam::aws:policy/SecretsManagerReadWrite" - ] - } - }, - "CfDirectDataRedshiftRole": { - "Type": "AWS::IAM::Role", - "Properties": { - "RoleName": { - "Fn::Join": [ - "-", - [ - "cf-direct-data-redshift-role", - { - "Ref": "AWS::Region" - } - ] - ] - }, - "AssumeRolePolicyDocument": { - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Principal": { - "Service": "redshift.amazonaws.com" - }, - "Action": "sts:AssumeRole" - } - ] - }, - "ManagedPolicyArns": [ - "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess" - ] - }, - "Metadata": { - "AWS::CloudFormation::Designer": { - "id": "7fe53bfb-ccf0-4a80-b626-ef1b97de3af8" - } - } - }, - "CfDirectDataLambda": { - "Type": "AWS::Lambda::Function", - "Properties": { - "Role": { - "Fn::GetAtt": [ - "CfDirectDataLambdaExecutionRole", - "Arn" - ] - }, - "FunctionName": "cf-direct-data", - "Timeout": 900, - "MemorySize": 10240, - "PackageType": "Image", - "Code": { - "ImageUri": { - "Fn::Sub": "${AWS::AccountId}.dkr.ecr.${AWS::Region}.amazonaws.com/cf-direct-data:lambda-latest" - } - } - }, - "Metadata": { - "AWS::CloudFormation::Designer": { - "id": "ee43fa17-a878-4ec1-b441-cc44f82af366" - } - } - }, - "CfDirectDataSecret": { - "Type": "AWS::SecretsManager::Secret", - "Properties": { - "Name": "direct-data-config.ini", - "SecretString": "[system]\nyear=2024\n[vault]\nusername=integration.user@cholecap.com\npassword=password\ndns=cholecap.veevavault.com\nversion=v24.1\n[s3]\nbucket_name=cf-direct-data\nstarting_directory=direct-data\n[redshift]\nhost=cf-direct-data.123456abcd.us-east-1.redshift.amazonaws.com\nport=5439\nuser=awsuser\npassword=Passw0rd\ndbname=dev\niam_redshift_s3_read=arn:aws:iam::123456:role/RedshiftS3Read\n[lambda]\nfunction_name=cf-direct-data\n[batch]\njob_name=cf-direct-data\njob_queue=cf-direct-data\njob_definition=cf-direct-data" - } - }, - "CfDirectDataRedshiftClusterSubnet": { - "Type": "AWS::Redshift::ClusterSubnetGroup", - "Properties": { - "Description": "Subnet for direct data cluster", - "SubnetIds": [ - { - "Ref": "CfDirectDataVpcSubnet" - } - ] - } - }, - "CfDirectDataRedshiftCluster": { - "Type": "AWS::Redshift::Cluster", - "Properties": { - "ClusterIdentifier": "cf-direct-data", - "NodeType": "dc2.large", - "MasterUsername": "awsuser", - "MasterUserPassword": "Passw0rd", - "ClusterType": "single-node", - 
"DBName": "dev", - "IamRoles": [ - { - "Fn::GetAtt": [ - "CfDirectDataRedshiftRole", - "Arn" - ] - }, - { - "Fn::Sub": "arn:${AWS::Partition}:iam::${AWS::AccountId}:role/aws-service-role/redshift.amazonaws.com/AWSServiceRoleForRedshift" - } - ], - "ClusterSubnetGroupName": { - "Ref": "CfDirectDataRedshiftClusterSubnet" - }, - "VpcSecurityGroupIds": [ - { - "Ref": "CfDirectDataSecurityGroup" - } - ] - } - }, - "CfDirectDataComputeEnvironment": { - "Type": "AWS::Batch::ComputeEnvironment", - "Properties": { - "ComputeEnvironmentName": "cf-direct-data", - "Type": "MANAGED", - "ServiceRole": { - "Fn::Sub": "arn:${AWS::Partition}:iam::${AWS::AccountId}:role/aws-service-role/batch.amazonaws.com/AWSServiceRoleForBatch" - }, - "State": "ENABLED", - "ComputeResources": { - "Type": "FARGATE", - "MaxvCpus": 256, - "Subnets": [ - { - "Ref": "CfDirectDataVpcSubnet" - } - ], - "SecurityGroupIds": [ - { - "Ref": "CfDirectDataSecurityGroup" - } - ] - } - } - }, - "CfDirectDataJobQueue": { - "Type": "AWS::Batch::JobQueue", - "DependsOn": "CfDirectDataComputeEnvironment", - "Properties": { - "ComputeEnvironmentOrder": [ - { - "ComputeEnvironment": { - "Fn::Sub": "arn:${AWS::Partition}:batch:${AWS::Region}:${AWS::AccountId}:compute-environment/cf-direct-data" - }, - "Order": "1" - } - ], - "Priority": 1, - "JobQueueName": "cf-direct-data", - "State": "ENABLED" - } - }, - "CfDirectDataJobDefinition": { - "Type": "AWS::Batch::JobDefinition", - "Properties": { - "Type": "container", - "JobDefinitionName": "cf-direct-data", - "PlatformCapabilities": [ - "FARGATE" - ], - "ContainerProperties": { - "Image": { - "Fn::Sub": "${AWS::AccountId}.dkr.ecr.${AWS::Region}.amazonaws.com/cf-direct-data:batch-latest" - }, - "Command": [ - "python", - "run.py", - "--step", - "$STEP", - "--source_filepath", - "$SOURCE_FILEPATH", - "--target_filepath", - "$TARGET_FILEPATH", - "--continue_processing", - "$CONTINUE_PROCESSING", - "--start_time", - "$START_TIME", - "--stop_time", - "$STOP_TIME" - ], - "JobRoleArn": { - "Fn::GetAtt": [ - "CfDirectDataEcsTaskExecutionRole", - "Arn" - ] - }, - "ExecutionRoleArn": { - "Fn::GetAtt": [ - "CfDirectDataEcsTaskExecutionRole", - "Arn" - ] - }, - "ResourceRequirements": [ - { - "Value": "1.0", - "Type": "VCPU" - }, - { - "Value": "8192", - "Type": "MEMORY" - } - ], - "RuntimePlatform": { - "OperatingSystemFamily": "LINUX", - "CpuArchitecture": "X86_64" - }, - "NetworkConfiguration": { - "AssignPublicIp": "ENABLED" - }, - "EphemeralStorage": { - "SizeInGiB": 200 - }, - "LogConfiguration": { - "LogDriver": "awslogs" - } - } - } - } - } -} \ No newline at end of file + "AWSTemplateFormatVersion": "2010-09-09", + "Metadata": + { + "AWS::CloudFormation::Designer": + { + "9ca5f848-8195-4390-9090-b0ce8302d4d3": + { + "size": { "width": 140, "height": 140 }, + "position": { "x": 294, "y": 169 }, + "z": 0, + "embeds": [], + }, + "490f8729-01cc-4bc0-b9c1-c7d4fdcc4a3c": + { + "size": { "width": 140, "height": 140 }, + "position": { "x": 150, "y": 170 }, + "z": 0, + "embeds": [], + }, + "cc139752-432c-4a72-a9d3-8c2947da4c3e": + { + "size": { "width": 60, "height": 60 }, + "position": { "x": 450, "y": 210 }, + "z": 0, + "embeds": [], + }, + "7fe53bfb-ccf0-4a80-b626-ef1b97de3af8": + { + "size": { "width": 60, "height": 60 }, + "position": { "x": 60, "y": 200 }, + "z": 0, + "embeds": [], + }, + "ee43fa17-a878-4ec1-b441-cc44f82af366": + { + "size": { "width": 60, "height": 60 }, + "position": { "x": 64.94355492876814, "y": 108.01864856583974 }, + "z": 0, + }, + }, + }, + "Resources": + { + 
"CfDirectDataInternetGateway": + { "Type": "AWS::EC2::InternetGateway", "Properties": {} }, + "CfDirectDataVpc": + { + "Type": "AWS::EC2::VPC", + "Properties": + { + "CidrBlock": "10.0.0.0/16", + "EnableDnsSupport": "true", + "Tags": [{ "Key": "Name", "Value": "cf-direct-data" }], + }, + "Metadata": + { + "AWS::CloudFormation::Designer": + { "id": "9ca5f848-8195-4390-9090-b0ce8302d4d3" }, + }, + }, + "InternetGatewayAttachment": + { + "Type": "AWS::EC2::VPCGatewayAttachment", + "Properties": + { + "VpcId": { "Ref": "CfDirectDataVpc" }, + "InternetGatewayId": { "Ref": "CfDirectDataInternetGateway" }, + }, + }, + "CfDirectDataVpcSubnet": + { + "Type": "AWS::EC2::Subnet", + "Properties": + { + "CidrBlock": "10.0.0.0/24", + "VpcId": { "Ref": "CfDirectDataVpc" }, + "Tags": [{ "Key": "Name", "Value": "cf-direct-data" }], + }, + "Metadata": + { + "AWS::CloudFormation::Designer": + { "id": "490f8729-01cc-4bc0-b9c1-c7d4fdcc4a3c" }, + }, + }, + "CfDirectDataSecurityGroup": + { + "Type": "AWS::EC2::SecurityGroup", + "Properties": + { + "GroupDescription": "Allows traffic to redshift", + "GroupName": "cf-direct-data-security-group", + "VpcId": { "Ref": "CfDirectDataVpc" }, + "SecurityGroupIngress": + [ + { + "FromPort": 80, + "IpProtocol": "tcp", + "CidrIp": "0.0.0.0/0", + "ToPort": 80, + }, + { + "FromPort": 5439, + "IpProtocol": "tcp", + "CidrIp": "0.0.0.0/0", + "ToPort": 5439, + }, + ], + }, + }, + "CfDirectDataS3": + { + "Type": "AWS::S3::Bucket", + "Properties": + { + "BucketName": + { + "Fn::Sub": "${AWS::AccountId}-${AWS::Region}-cf-direct-data", + }, + }, + "Metadata": + { + "AWS::CloudFormation::Designer": + { "id": "cc139752-432c-4a72-a9d3-8c2947da4c3e" }, + }, + }, + "CfDirectDataLambdaExecutionRole": + { + "Type": "AWS::IAM::Role", + "Properties": + { + "RoleName": + { + "Fn::Join": + [ + "-", + [ + "cf-direct-data-lambda-execution-role", + { "Ref": "AWS::Region" }, + ], + ], + }, + "AssumeRolePolicyDocument": + { + "Version": "2012-10-17", + "Statement": + [ + { + "Effect": "Allow", + "Principal": { "Service": "lambda.amazonaws.com" }, + "Action": "sts:AssumeRole", + }, + ], + }, + "ManagedPolicyArns": + [ + "arn:aws:iam::aws:policy/AWSBatchFullAccess", + "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy", + "arn:aws:iam::aws:policy/SecretsManagerReadWrite", + "arn:aws:iam::aws:policy/AmazonS3FullAccess", + "arn:aws:iam::aws:policy/AWSLambda_FullAccess", + "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole", + "arn:aws:iam::aws:policy/AmazonRedshiftFullAccess", + "arn:aws:iam::aws:policy/AmazonSQSFullAccess", + "arn:aws:iam::aws:policy/service-role/AWSLambdaSQSQueueExecutionRole", + ], + }, + "Metadata": + { + "AWS::CloudFormation::Designer": + { "id": "7fe53bfb-ccf0-4a80-b626-ef1b97de3af8" }, + }, + }, + "CfDirectDataEcsTaskExecutionRole": + { + "Type": "AWS::IAM::Role", + "Properties": + { + "RoleName": + { + "Fn::Join": + [ + "-", + [ + "cf-direct-data-ecs-task-execution-role", + { "Ref": "AWS::Region" }, + ], + ], + }, + "AssumeRolePolicyDocument": + { + "Version": "2008-10-17", + "Statement": + [ + { + "Effect": "Allow", + "Principal": { "Service": "ecs-tasks.amazonaws.com" }, + "Action": "sts:AssumeRole", + }, + ], + }, + "ManagedPolicyArns": + [ + "arn:aws:iam::aws:policy/AmazonAPIGatewayInvokeFullAccess", + "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly", + "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy", + "arn:aws:iam::aws:policy/AmazonS3FullAccess", + "arn:aws:iam::aws:policy/AWSBatchFullAccess", + 
"arn:aws:iam::aws:policy/AWSLambda_FullAccess", + "arn:aws:iam::aws:policy/SecretsManagerReadWrite", + ], + }, + }, + "CfDirectDataRedshiftRole": + { + "Type": "AWS::IAM::Role", + "Properties": + { + "RoleName": + { + "Fn::Join": + [ + "-", + [ + "cf-direct-data-redshift-role", + { "Ref": "AWS::Region" }, + ], + ], + }, + "AssumeRolePolicyDocument": + { + "Version": "2012-10-17", + "Statement": + [ + { + "Effect": "Allow", + "Principal": { "Service": "redshift.amazonaws.com" }, + "Action": "sts:AssumeRole", + }, + ], + }, + "ManagedPolicyArns": + ["arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"], + }, + "Metadata": + { + "AWS::CloudFormation::Designer": + { "id": "7fe53bfb-ccf0-4a80-b626-ef1b97de3af8" }, + }, + }, + "CfDirectDataLambda": + { + "Type": "AWS::Lambda::Function", + "Properties": + { + "Role": + { "Fn::GetAtt": ["CfDirectDataLambdaExecutionRole", "Arn"] }, + "FunctionName": "cf-direct-data", + "Timeout": 900, + "MemorySize": 10240, + "PackageType": "Image", + "Environment": + { + "Variables": + { + "EXTRACT_SOURCE_CONTENT": "False", + "SECRET_NAME": "direct-data-config.ini", + }, + }, + "Code": + { + "ImageUri": + { + "Fn::Sub": "${AWS::AccountId}.dkr.ecr.${AWS::Region}.amazonaws.com/cf-direct-data:lambda-latest", + }, + }, + }, + "Metadata": + { + "AWS::CloudFormation::Designer": + { "id": "ee43fa17-a878-4ec1-b441-cc44f82af366" }, + }, + }, + "CfDirectDataSecret": + { + "Type": "AWS::SecretsManager::Secret", + "Properties": + { + "Name": "direct-data-config.ini", + "SecretString": "[demo]\n;vault\nvault_username=integration.user@cholecap.com\nvault_password=password\nvault_dns=cholecap.veevavault.com\nvault_version=v24.1\n;redshift\nredshift_host=cf-direct-data.123456abcd.us-east-1.redshift.amazonaws.com\nredshift_port=5439\nredshift_user=awsuser\nredshift_password=Passw0rd\nredshift_dbname=dev\nredshift_iam_redshift_s3_read=arn:aws:iam::123456:role/RedshiftS3Read\n;s3\ns3_bucket_name=cf-direct-data\ns3_starting_directory=direct-data\n;batch\njob_name=cf-direct-data\njob_queue=cf-direct-data\njob_definition=cf-direct-data\nlambda_function_name=cf-direct-data", + }, + }, + "CfDirectDataRedshiftClusterSubnet": + { + "Type": "AWS::Redshift::ClusterSubnetGroup", + "Properties": + { + "Description": "Subnet for direct data cluster", + "SubnetIds": [{ "Ref": "CfDirectDataVpcSubnet" }], + }, + }, + "CfDirectDataRedshiftCluster": + { + "Type": "AWS::Redshift::Cluster", + "Properties": + { + "ClusterIdentifier": "cf-direct-data", + "NodeType": "dc2.large", + "MasterUsername": "awsuser", + "MasterUserPassword": "Passw0rd", + "ClusterType": "single-node", + "DBName": "dev", + "IamRoles": + [ + { "Fn::GetAtt": ["CfDirectDataRedshiftRole", "Arn"] }, + { + "Fn::Sub": "arn:${AWS::Partition}:iam::${AWS::AccountId}:role/aws-service-role/redshift.amazonaws.com/AWSServiceRoleForRedshift", + }, + ], + "ClusterSubnetGroupName": + { "Ref": "CfDirectDataRedshiftClusterSubnet" }, + "VpcSecurityGroupIds": [{ "Ref": "CfDirectDataSecurityGroup" }], + }, + }, + "CfDirectDataComputeEnvironment": + { + "Type": "AWS::Batch::ComputeEnvironment", + "Properties": + { + "ComputeEnvironmentName": "cf-direct-data", + "Type": "MANAGED", + "ServiceRole": + { + "Fn::Sub": "arn:${AWS::Partition}:iam::${AWS::AccountId}:role/aws-service-role/batch.amazonaws.com/AWSServiceRoleForBatch", + }, + "State": "ENABLED", + "ComputeResources": + { + "Type": "FARGATE", + "MaxvCpus": 256, + "Subnets": [{ "Ref": "CfDirectDataVpcSubnet" }], + "SecurityGroupIds": [{ "Ref": "CfDirectDataSecurityGroup" }], + }, + }, + }, + 
"CfDirectDataJobQueue": + { + "Type": "AWS::Batch::JobQueue", + "DependsOn": "CfDirectDataComputeEnvironment", + "Properties": + { + "ComputeEnvironmentOrder": + [ + { + "ComputeEnvironment": + { + "Fn::Sub": "arn:${AWS::Partition}:batch:${AWS::Region}:${AWS::AccountId}:compute-environment/cf-direct-data", + }, + "Order": "1", + }, + ], + "Priority": 1, + "JobQueueName": "cf-direct-data", + "State": "ENABLED", + }, + }, + "CfDirectDataJobDefinition": + { + "Type": "AWS::Batch::JobDefinition", + "Properties": + { + "Type": "container", + "JobDefinitionName": "cf-direct-data", + "PlatformCapabilities": ["FARGATE"], + "ContainerProperties": + { + "Image": + { + "Fn::Sub": "${AWS::AccountId}.dkr.ecr.${AWS::Region}.amazonaws.com/cf-direct-data:batch-latest", + }, + "Command": + [ + "python", + "run.py", + "--step", + "$STEP", + "--source_filepath", + "$SOURCE_FILEPATH", + "--target_filepath", + "$TARGET_FILEPATH", + "--continue_processing", + "$CONTINUE_PROCESSING", + "--start_time", + "$START_TIME", + "--stop_time", + "$STOP_TIME", + ], + "JobRoleArn": + { + "Fn::GetAtt": ["CfDirectDataEcsTaskExecutionRole", "Arn"], + }, + "ExecutionRoleArn": + { + "Fn::GetAtt": ["CfDirectDataEcsTaskExecutionRole", "Arn"], + }, + "ResourceRequirements": + [ + { "value": "4.0", "type": "VCPU" }, + { "value": "30720", "type": "MEMORY" }, + ], + "RuntimePlatform": + { + "OperatingSystemFamily": "LINUX", + "CpuArchitecture": "X86_64", + }, + "NetworkConfiguration": { "AssignPublicIp": "ENABLED" }, + "EphemeralStorage": { "SizeInGiB": 200 }, + "LogConfiguration": { "LogDriver": "awslogs" }, + }, + }, + }, + }, +} diff --git a/Public Direct Data Lambda API.postman_collection.json b/Public Direct Data Lambda API.postman_collection.json index bc87b9b..61906f3 100644 --- a/Public Direct Data Lambda API.postman_collection.json +++ b/Public Direct Data Lambda API.postman_collection.json @@ -16,7 +16,7 @@ "header": [], "body": { "mode": "raw", - "raw": "{\n \"step\": \"retrieve\", //This command is to invoke the listing and downloading of Direct Data files\n \"start_time\": \"2000-01-01T00:00Z\", //This is start of the time window for the specific direct data file\n \"stop_time\": \"2024-03-11T00:00Z\", //This is stop of the time window for the specific direct data file\n \"extract_type\": \"full\", //This is the type of Direct Data file to be retrieved. The options are full, incremental or log\n \"continue_processing\": false //This determines if you want to perform the full extraction process or stop afte this specific step has been completed.\n}", + "raw": "{\n \"step\": \"retrieve\", //This command is to invoke the listing and downloading of Direct Data files\n \"start_time\": \"2000-01-01T00:00Z\", //This is start of the time window for the specific direct data file\n \"stop_time\": \"2024-03-11T00:00Z\", //This is stop of the time window for the specific direct data file\n \"extract_type\": \"full\", //This is the type of Direct Data file to be retrieved. 
The options are full, incremental or log\n \"continue_processing\": false, //This determines if you want to perform the full extraction process or stop afte this specific step has been completed.\n \"secret\": \"demo\" //This specifies which secret block to use in the Secrets Manager file declared in the Lambda environment variable\n}", "options": { "raw": { "language": "json" @@ -42,7 +42,7 @@ "header": [], "body": { "mode": "raw", - "raw": "{\n \"step\": \"unzip\", //This denotes that the unzip step will be performed\n \"source_file\": \"direct-data/168629-20240307-0845-N.tar.gz\", //This is the full path in S3 of the file that needs to be unzipped\n \"target_directory\": \"direct-data/168629-20240307-0845-N\", //This is the output directory of where the unzipped contents will be placed\n \"extract_type\": \"incremental\", //This is the type of Direct Data file that is being handled. The options are full, log, or incremental\n \"continue_processing\": false //This determines if you want to perform the full extraction process or stop afte this specific step has been completed.\n}", + "raw": "{\n \"step\": \"unzip\", //This denotes that the unzip step will be performed\n \"source_file\": \"direct-data/168629-20240307-0845-N.tar.gz\", //This is the full path in S3 of the file that needs to be unzipped\n \"target_directory\": \"direct-data/168629-20240307-0845-N\", //This is the output directory of where the unzipped contents will be placed\n \"extract_type\": \"incremental\", //This is the type of Direct Data file that is being handled. The options are full, log, or incremental\n \"continue_processing\": false, //This determines if you want to perform the full extraction process or stop afte this specific step has been completed.\n \"secret\": \"demo\" //This specifies which secret block to use in the Secrets Manager file declared in the Lambda environment variable\n}", "options": { "raw": { "language": "json" @@ -68,7 +68,7 @@ "header": [], "body": { "mode": "raw", - "raw": "{\n \"step\": \"load_data\",\n \"source_file\": \"direct-data/168629-20240307-0845-N\", //This is the directory of where Direct Data that is going to be loaded exists\n \"extract_type\": \"incremental\" //This is the type of Direct Data file that is being handled. The options are full, log, or incremental\n}", + "raw": "{\n \"step\": \"load_data\",\n \"source_file\": \"direct-data/168629-20240307-0845-N\", //This is the directory of where Direct Data that is going to be loaded exists\n \"extract_type\": \"incremental\", //This is the type of Direct Data file that is being handled. The options are full, log, or incremental\n \"secret\": \"demo\" //This specifies which secret block to use in the Secrets Manager file declared in the Lambda environment variable\n}", "options": { "raw": { "language": "json" diff --git a/README.md b/README.md index eef1aec..a868387 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,7 @@ Note: All resources should be created in the same AWS Region. * Step 1: * _Trusted entity type_: `AWS account` * _An AWS account_: `This account` + * _Use case_: `Cloudformation` * Step 2: Attach the following AWS managed policies. These are required to access Cloudformation and create the direct data resources. * AmazonAPIGatewayAdministrator * AmazonEC2ContainerRegistryFullAccess @@ -109,7 +110,8 @@ Note: All resources should be created in the same AWS Region. 
### S3 Bucket * Navigate to the S3 service in the AWS Console -* Search for and select the S3 bucket named `cf-direct-data` +* Search for and select the S3 bucket named `{ACCOUNT_ID}-{REGION}-cf-direct-data` +* Copy the s3 bucket name and note it down separately. This will be used in the Direct Data configuration file * Create a folder at the root of the bucket named `direct-data` ### Redshift Cluster @@ -128,13 +130,15 @@ Note: All resources should be created in the same AWS Region. ### Secrets Manager * Navigate to the Secrets Manager service in the AWS Console * Search for and select the secret named `direct-data-config.ini` -* Select `Retrieve secret value` then `Edit`. Update the following values: - * [vault] username - * [vault] password - * [vault] dns - * [redshift] host (Use the previously copied redshift endpoint. Do not include the port number/database name) - * [redshift] iam_redshift_s3_read (Use the previously copied ARN for `cf-direct-data-redshift-role-{REGION}`) - * [redshift] password (If updated in the previous step) +* Select `Retrieve secret value` then `Edit`. Update the following values under the [demo] section: + * vault_username + * vault_password + * vault_dns + * redshift_host (Use the previously copied redshift endpoint. Do not include the port number/database name) + * redshift_iam_redshift_s3_read (Use the previously copied ARN for `cf-direct-data-redshift-role-{REGION}`) + * redshift_password (If updated in the previous step) + * s3_bucket_name +* Additional sections can be added with different vault and/or AWS services specified for multiple Vault and database functionality. ### VPC * Navigate to the VPC service in the AWS Console @@ -159,7 +163,8 @@ Note: All resources should be created in the same AWS Region. "start_time": "2000-01-01T00:00Z", "stop_time": "2024-04-19T00:00Z", //Update this value to the current date "extract_type": "full", - "continue_processing": true + "continue_processing": true, + "secret": "demo" } ``` * Click `Send` @@ -194,7 +199,8 @@ These schedules should be created after the initial `full` extract is invoked. { "step": "retrieve", "extract_type": "incremental", - "continue_processing": true + "continue_processing": true, + "secret": "demo" } ``` * Select `Next` @@ -208,9 +214,10 @@ These schedules should be created after the initial `full` extract is invoked. 
* _Payload_: ```json { -"step": "retrieve", -"extract_type": "log", -"continue_processing": true + "step": "retrieve", + "extract_type": "log", + "continue_processing": true, + "secret": "demo" } ``` diff --git a/app/run.py b/app/run.py index c121076..2715a79 100644 --- a/app/run.py +++ b/app/run.py @@ -1,15 +1,12 @@ +import os import sys sys.path.append('.') from datetime import datetime, timezone, timedelta -import json from typing import Dict - -from common.redshift_setup import get_s3_path -from common.aws_utilities import invoke_lambda, start_batch_job -from common.direct_data_files_interface import list_direct_data_files, retrieve_direct_data_files, \ - unzip_direct_data_files, verify_redshift_tables +from common.aws_utilities import start_batch_job +from common.vault.direct_data_files_interface import unzip_direct_data_files from common.integrationConfigClass import IntegrationConfigClass from common.integrationRequestClass import IntegrationRequestClass from common.log_message import log_message @@ -20,16 +17,18 @@ def lambda_handler(event, context): # Initializing an API response response: ResponseClass = ResponseClass(200, '') current_region = context.invoked_function_arn.split(":")[3] + secret_name = os.environ.get("SECRET_NAME") log_message(log_level='Info', message=f'Current region of lambda: {current_region}', context=None) # Retrieving variables from AWS Secrets Manager - settings = IntegrationConfigClass(current_region) + settings = IntegrationConfigClass(current_region, secret_name) # Retrieving input parameters integration_request: IntegrationRequestClass = IntegrationRequestClass(event) step = integration_request.get_step() extract_type = integration_request.get_extract_type() continue_processing = bool(integration_request.get_continue_processing()) + secret = integration_request.get_secret() if continue_processing is None or continue_processing == '': continue_processing = False @@ -38,170 +37,153 @@ def lambda_handler(event, context): message=f'Starting Transaction with {step} step with {extract_type} extract type', context=None) - s3_bucket = settings.config.get("s3", "bucket_name") - s3_directory = settings.config.get("s3", "starting_directory") - function_name = settings.config.get('lambda', 'function_name') + log_message(log_level='Debug', + message=f'Secret name: {secret_name} and Secret block: {secret}', + context=None) + + s3_bucket = settings.config.get(secret, "s3_bucket_name") + s3_directory = settings.config.get(secret, "s3_starting_directory") + job_name = settings.config.get(secret, 'job_name') + job_queue = settings.config.get(secret, 'job_queue') + job_definition = settings.config.get(secret, 'job_definition') + + # Retrieving Direct Data Files if step == "retrieve": start_time = integration_request.get_start_time() stop_time = integration_request.get_stop_time() + # If the start_time and stop_time are empty, set the time difference to 15 minutes for incremental or 1 day + # for log extracts and format appropriately + if (start_time is None or start_time == '') and (stop_time is None or stop_time == ''): + if extract_type == "incremental": + stop_time = datetime.now(timezone.utc) - timedelta(minutes=15) + start_time = (stop_time - timedelta(minutes=15)) + elif extract_type == "log": + stop_time = datetime.now(timezone.utc) - timedelta(days=1) + start_time = (stop_time - timedelta(days=1)) + stop_time = stop_time.strftime("%Y-%m-%dT%H:%MZ") + start_time = start_time.strftime("%Y-%m-%dT%H:%MZ") + else: + start_time = check_time_format(start_time) + 
stop_time = check_time_format(stop_time) + + # Form job parameters and submit Batch job to unzip the retrieved files + job_parameter: Dict[str, str] = {'step': 'retrieve', + 'source_filepath': f'{s3_directory}', + 'continue_processing': 'true', + 'start_time': f'{start_time}', + 'stop_time': f'{stop_time}', + 'extract_type': f'{extract_type}', + 'secret_name': f'{secret_name}', + 'secret': f'{secret}'} + + log_message(log_level='Debug', + message=f'Job Parameters: {job_parameter}', + context=None) + + try: + batch_job_response = start_batch_job(job_name=f'{job_name}-retrieve', job_queue=job_queue, + job_definition=job_definition, + job_parameters=job_parameter) + except Exception as e: + response.set_status_code(500) + response.append_body(f'Error encountered when attempting to start AWS Batch job: \n{e}') + return response.to_dict() + + response.set_body(f'Starting AWS Batch Job with ID: {batch_job_response["jobName"]}') + + # Unzip previously retrieved Direct Data files + elif step == "unzip": + target_file_path = integration_request.get_target_directory() + source_filepath = integration_request.get_source_filepath() + + # If the extract type is incremental, then just extract in the lambda function, otherwise submit a Batch job + # to unzip. if extract_type == "incremental": + log_message(log_level='Info', + message=f'Unzipping {source_filepath} to {target_file_path}/', + context=None) try: - if (start_time is None or start_time == '') and (stop_time is None or stop_time == ''): - stop_time = datetime.now(timezone.utc) - timedelta(minutes=15) - start_time = (stop_time - timedelta(minutes=15)) - stop_time = stop_time.strftime("%Y-%m-%dT%H:%MZ") - start_time = start_time.strftime("%Y-%m-%dT%H:%MZ") - else: - start_time = check_time_format(integration_request.get_start_time()) - stop_time = check_time_format(integration_request.get_stop_time()) - log_message(log_level='Info', - message=f'Start time: {start_time} and stop time: {stop_time}', - context=None) - list_of_direct_data_files_response = list_direct_data_files(start_time=start_time, - stop_time=stop_time, - extract_type=f'{extract_type}_directdata') + successful_unzip = unzip_direct_data_files(s3_bucket, source_filepath, + f'{integration_request.get_target_directory()}/') except Exception as e: response.set_status_code(500) - response.append_body(f'Error when trying to list direct data files:\n{e}') + response.append_body(f'Errors encountered when attempting to unzip files {source_filepath}\n{e}') return response.to_dict() - file_paths_retrieved: Dict[str, str] = {} - - if list_of_direct_data_files_response.is_successful() and bool(list_of_direct_data_files_response.data): - for file in list_of_direct_data_files_response.data: - log_message(log_level='Info', - message=f'Iterating over direct data file list', - context=None) - log_message(log_level='Debug', - message=f'File name: {file.name} and record count: {file.record_count}', - context=None) - if file.record_count > 0: - file_paths_retrieved[file.name] = file.filename - else: - response.set_body("Nothing was returned when attempting to list the direct data files or there is an " - "issue with the response") - if len(file_paths_retrieved) > 0: - for file_path_name, file_name in file_paths_retrieved.items(): + if continue_processing: + # If the unzipping is successful, form job parameters and submit a Batch job to load the data + if successful_unzip: + + job_parameter: Dict[str, str] = {'step': 'load_data', + 'source_filepath': f'{target_file_path}', + 'extract_type':
f'{extract_type}', + 'secret_name': f'{secret_name}', + 'secret': f'{secret}'} + try: - retrieval_success = retrieve_direct_data_files( - list_files_response=list_of_direct_data_files_response, - bucket_name=s3_bucket, - starting_directory=f'{s3_directory}/{file_name}') + start_batch_job(job_name=f'{job_name}-load', job_queue=job_queue, + job_definition=job_definition, + job_parameters=job_parameter) except Exception as e: response.set_status_code(500) - response.append_body(f'Error when trying to download direct data files: \n{e}') + response.append_body(f'Error encountered when attempting to start AWS Batch job: \n{e}') return response.to_dict() - if continue_processing and retrieval_success: - payload: Dict[str, str] = {'step': 'unzip', - 'source_filepath': f'{s3_directory}/{file_name}', - 'target_filepath': f'{s3_directory}/{file_path_name}', - 'continue_processing': f'{continue_processing}'} - - try: - invoke_lambda(function_name=function_name, payload=json.dumps(payload)) - except Exception as e: - response.set_status_code(500) - response.append_body(f'Error encountered when invoking AWS Lambda: \n{e}') - return response.to_dict() - response.append_body(f'Invoking AWS Lambda: {function_name} to unzip the retrieved files') - else: - response.set_body('No updates to be made') else: - if start_time is not None and start_time != '': - start_time = check_time_format(integration_request.get_start_time()) - else: - start_time = '' - if stop_time is not None and stop_time != '': - stop_time = check_time_format(integration_request.get_stop_time()) - else: - stop_time = '' - job_name = settings.config.get('batch', 'job_name') - job_queue = settings.config.get('batch', 'job_queue') - job_definition = settings.config.get('batch', 'job_definition') - job_parameter: Dict[str, str] = {'step': 'retrieve', - 'source_filepath': f'{s3_directory}', - 'continue_processing': 'true', - 'start_time': f'{start_time}', - 'stop_time': f'{stop_time}', - 'extract_type': f'{extract_type}'} + + job_parameter: Dict[str, str] = {'step': 'unzip', + 'source_filepath': f'{source_filepath}', + 'target_directory': f'{target_file_path}', + 'extract_type': f'{extract_type}', + 'continue_processing': f'{continue_processing}', + 'secret_name': f'{secret_name}', + 'secret': f'{secret}'} log_message(log_level='Debug', message=f'Job Parameters: {job_parameter}', context=None) + try: - batch_job_response = start_batch_job(job_name=f'{job_name}-retrieve', job_queue=job_queue, + batch_job_response = start_batch_job(job_name=f'{job_name}-unzip', job_queue=job_queue, job_definition=job_definition, job_parameters=job_parameter) + log_message(log_level='Info', + message=f'Starting {job_name} with ID: {batch_job_response["jobId"]} to unzip files', + context=None) + except Exception as e: response.set_status_code(500) - response.append_body(f'Error encountered when attempting to stary AWS Batch job: \n{e}') + response.append_body(f'Error encountered when attempting to start AWS Batch job: \n{e}') return response.to_dict() - response.set_body(f'Starting AWS Batch Job with ID: {batch_job_response["jobName"]}') + # Load the extracted Direct Data files into a specified Redshift database via a Batch job + elif step == "load_data": + + source_filepath = integration_request.get_source_filepath() + extract_source_content = os.environ.get("EXTRACT_SOURCE_CONTENT") - elif step == "unzip": - target_file_path = integration_request.get_target_directory() - source_filepath = integration_request.get_source_file() log_message(log_level='Info', -
message=f'Unzipping {source_filepath} to {target_file_path}/', + message=f'Loading data from {source_filepath}', context=None) + + job_parameter: Dict[str, str] = {'step': 'load_data', + 'source_filepath': f'{source_filepath}', + 'extract_source_content': f'{extract_source_content}', + 'extract_type': f'{extract_type}', + 'secret_name': f'{secret_name}', + 'secret': f'{secret}'} + try: - successful_unzip = unzip_direct_data_files(s3_bucket, source_filepath, - f'{integration_request.get_target_directory()}/') - except Exception as e: - response.set_status_code(500) - response.append_body(f'Errors encountered when attempting to unzip files {source_filepath}\n{e}') - return response.to_dict() - if continue_processing: - if successful_unzip: - function_name = settings.config.get('lambda', 'name') - payload: Dict[str, str] = {'step': 'load_data', - 'source_file': f'{target_file_path}', - 'extract_type': 'incremental'} - - try: - invoke_lambda(function_name=function_name, payload=json.dumps(payload)) - response.append_body(f'Invoking AWS Lambda: {function_name} to load the unzipped files') - except Exception as e: - response.set_status_code(500) - response.append_body(f'Error encountered when invoking AWS Lambda: \n{e}') - return response.to_dict() - elif step == "load_data": - source_filepath = integration_request.get_source_file() - vault_id = source_filepath.split("/")[-1].split("-")[0] - schema_name = f'vault_{vault_id}' - try: - manifest_filepath = get_s3_path("manifest", s3_bucket, source_filepath) - log_message(log_level='Debug', - message=f'The manifest file: {manifest_filepath}', - context=None) - metadata_filepath = get_s3_path("metadata.csv", s3_bucket, source_filepath) - if metadata_filepath is None or not metadata_filepath.strip(): - metadata_filepath = get_s3_path("metadata_full.csv", s3_bucket, source_filepath) - log_message(log_level='Debug', - message=f'The metadata file: {metadata_filepath}', + log_message(log_level='Info', + message=f'Starting {job_name}-load job in the {job_queue} with {job_definition} definition and {job_parameter} parameters', context=None) - metadata_deletes_filepath = get_s3_path("metadata_deletes.csv", s3_bucket, source_filepath) + start_batch_job(job_name=f'{job_name}-load', job_queue=job_queue, + job_definition=job_definition, + job_parameters=job_parameter) except Exception as e: response.set_status_code(500) - response.append_body(f'Errors encountered when search for files in S3: \n{e}') + response.append_body(f'Error encountered when attempting to start AWS Batch job: \n{e}') return response.to_dict() - try: - verify_redshift_tables(chunk_size=500, - bucket_name=s3_bucket, - manifest_path=manifest_filepath, - metadata_path=metadata_filepath, - starting_directory=source_filepath, - extract_type=extract_type, - metadata_deletes_filepath=metadata_deletes_filepath, - schema_name=schema_name) - except Exception as e: - response.set_status_code(500) - response.append_body(f'Errors encountered when attempting to load the data:\n{e}') - return response.to_dict() - - response.append_body('Successfully loaded Vault Direct Data into Redshift') response.append_body(f'Completed {step} step.') return response.to_dict() diff --git a/common/api/model/component/document.py b/common/api/model/component/document.py new file mode 100644 index 0000000..5b27bbc --- /dev/null +++ b/common/api/model/component/document.py @@ -0,0 +1,65 @@ +from __future__ import annotations +from dataclasses import field +from typing import List + +from pydantic.dataclasses import
dataclass + +from ..vault_model import VaultModel + + +@dataclass(config=dict(extra="allow")) +class Document(VaultModel): + """ + Model for the Document object in the response. + + """ + + id: int = None + version_id: str = None + major_version_number__v: int = None + minor_version_number__v: int = None + annotations_all__v: int = None + annotations_anchors__v: int = None + annotations_lines__v: int = None + annotations_links__v: int = None + annotations_notes__v: int = None + annotations_resolved__v: int = None + annotations_unresolved__v: int = None + archive__v: bool = None + binder__v: bool = None + bound_source_major_version__v: int = None + bound_source_minor_version__v: int = None + classification__v: str = None + created_by__v: int = None + crosslink__v: bool = None + description__v: str = None + document_creation_date__v: str = None + document_number__v: str = None + external_id__v: str = None + filename__v: str = None + format__v: str = None + latest_source_major_version__v: int = None + latest_source_minor_version__v: int = None + last_modified_by__v: int = None + lifecycle__v: str = None + link_status__v: List[str] = field(default_factory=list) + locked__v: bool = None + md5checksum__v: str = None + name__v: str = None + pages__v: int = None + size__v: int = None + source_binding_rule__v: List[str] = field(default_factory=list) + source_document_id__v: int = None + source_document_name__v: str = None + source_document_number__v: str = None + source_owner__v: int = None + source_vault_id__v: int = None + source_vault_name__v: str = None + status__v: str = None + subtype__v: str = None + suppress_rendition__v: str = None + title__v: str = None + type__v: str = None + version_created_by__v: int = None + version_creation_date__v: str = None + version_modified_date__v: str = None \ No newline at end of file diff --git a/common/api/model/component/document_field.py b/common/api/model/component/document_field.py new file mode 100644 index 0000000..0400c01 --- /dev/null +++ b/common/api/model/component/document_field.py @@ -0,0 +1,55 @@ +from __future__ import annotations +from dataclasses import field +from typing import List + +from pydantic.dataclasses import dataclass + +from ..vault_model import VaultModel + + +@dataclass(config=dict(extra="allow")) +class DocumentField(VaultModel): + """ + Model for the Document Field object in the response. + + Attributes: + required (bool): When true, the field value must be set when creating new documents. + editable (bool): When true, the field value can be defined by the currently authenticated user. When false, the field value is read-only or system-managed, + or the current user does not have adequate permissions to edit this field. + setOnCreateOnly (bool): When true, the field value can only be set once (when creating new documents). + hidden: (bool): Boolean indicating field availability to the UI. When true, the field is never available to nor visible in the UI. When false, the field is always available to the UI + but visibility to users is subject to field-level security overrides. + queryable (bool): When true, field values can be retrieved using VQL. + noCopy (bool): When true, field values are not copied when using the Make a Copy action. + facetable (bool): When true, the field is available for use as a faceted filter in the Vault UI. 
+ """ + + definedIn: str = None + definedInType: str = None + disabled: bool = None + editable: bool = None + facetable: bool = None + helpContent: str = None + hidden: bool = None + label: str = None + maxLength: int = None + maxValue: int = None + minValue: int = None + name: str = None + noCopy: bool = None + queryable: bool = None + repeating: bool = None + required: bool = None + scope: str = None + section: str = None + sectionPosition: int = None + setOnCreateOnly: bool = None + shared: bool = None + systemAttribute: bool = None + type: str = None + usedIn: List[UsedIn] = field(default_factory=list) + + @dataclass + class UsedIn: + key: str = None + type: str = None diff --git a/common/api/model/component/job.py b/common/api/model/component/job.py new file mode 100644 index 0000000..44f148f --- /dev/null +++ b/common/api/model/component/job.py @@ -0,0 +1,25 @@ +from pydantic import Field +from pydantic.dataclasses import dataclass +from typing import List + +from ..vault_model import VaultModel + +@dataclass +class Link(VaultModel): + rel: str = None + href: str = None + method: str = None + accept: str = None + + +@dataclass +class Job(VaultModel): + created_by: int = None + created_date: str = None + id: int = None + method: str = None + run_end_date: str = None + run_start_date: str = None + status: str = None + title: str = None + links: List[Link] = Field(default_factory=list) diff --git a/common/api/model/response/document_response.py b/common/api/model/response/document_response.py new file mode 100644 index 0000000..d39827c --- /dev/null +++ b/common/api/model/response/document_response.py @@ -0,0 +1,384 @@ +""" +Module that defines classes used to represent responses from the MDL endpoints. +""" +from __future__ import annotations + +from typing import List + +from pydantic import Field +from pydantic.dataclasses import dataclass + +from .vault_response import VaultResponse +from ..component.document import Document +from ..component.document_field import DocumentField +from ..vault_model import VaultModel + + +@dataclass +class DocumentFieldResponse(VaultResponse): + """ + Model for the following API calls responses: + + Retrieve All Document Fields
+ Retrieve Common Document Fields + + Attributes: + properties (List[DocumentField]): The list of document fields. + + Vault API Endpoint: + GET /api/{version}/metadata/objects/documents/properties
+ POST /api/{version}/metadata/objects/documents/properties/find_common + + Vault API Documentation: + [https://developer.veevavault.com/api/24.1/#retrieve-all-document-fields](https://developer.veevavault.com/api/24.1/#retrieve-all-document-fields) + [https://developer.veevavault.com/api/24.1/#retrieve-common-document-fields](https://developer.veevavault.com/api/24.1/#retrieve-common-document-fields) + """ + + properties: List[DocumentField] = Field(default_factory=list) + + +@dataclass +class DocumentTypesResponse(VaultResponse): + """ + Model for the following API calls responses: + + Retrieve All Document Types + + Attributes: + types (List[DocumentType]): List of all standard and custom document types in your Vault. + lock (str): URL to retrieve the document lock metadata (document check-out). + + Vault API Endpoint: + GET /api/{version}/metadata/objects/documents/types + + Vault API Documentation: + [https://developer.veevavault.com/api/24.1/#retrieve-all-document-types](https://developer.veevavault.com/api/24.1/#retrieve-all-document-types) + """ + + types: List[DocumentType] = Field(default_factory=list) + lock: str = None + + @dataclass + class DocumentType(VaultModel): + """ + Model for the Document Type object in the response. + + Attributes: + label (str): Label of each document type as seen in the API and UI. + value (str): URL to retrieve the metadata associated with each document type. + """ + + label: str = None + value: str = None + + +@dataclass +class DocumentTypeHeirarchyResponse(VaultResponse): + """ + Model for the following API calls responses: + + Retrieve Document Type
+ Retrieve Document Subtype
+ Retrieve Document Classification + + Attributes: + name (str): Name of the document type. Used primarily in the API. + label (str): Label of the document type as seen in the API and UI. + properties (List[DocumentField]): List of all the document fields associated to the document type. + renditions (List[str]): List of all rendition types available. + relationshipTypes (List[RelationshipType]): List of all relationship types available. + templates (List[Template]): List of all templates available (when configured). + availableLifecycles (List[Lifecycle]): List of all lifecycles available. + subtypes (List[Subtype]): List of all document subtypes available for the document type. + classifications (List[Classification]): List of all document classifications available for the document subtype. + + Vault API Endpoint: + GET /api/{version}/metadata/objects/documents/types/{type}
+ GET /api/{version}/metadata/objects/documents/types/{type}/subtypes/{subtype}
+ GET /api/{version}/metadata/objects/documents/types/{type}/classifications/{classification} + + Vault API Documentation: + [https://developer.veevavault.com/api/24.1/#retrieve-document-type](https://developer.veevavault.com/api/24.1/#retrieve-document-type) + [https://developer.veevavault.com/api/24.1/#retrieve-document-subtype](https://developer.veevavault.com/api/24.1/#retrieve-document-subtype) + [https://developer.veevavault.com/api/24.1/#retrieve-document-classification](https://developer.veevavault.com/api/24.1/#retrieve-document-classification) + """ + name: str = None + label: str = None + properties: List[DocumentField] = Field(default_factory=list) + renditions: List[str] = Field(default_factory=list) + relationshipTypes: List[RelationshipType] = Field(default_factory=list) + templates: List[Template] = Field(default_factory=list) + availableLifecycles: List[Lifecycle] = Field(default_factory=list) + subtypes: List[Subtype] = Field(default_factory=list) + classifications: List[Classification] = Field(default_factory=list) + + @dataclass + class RelationshipType(VaultModel): + """ + Model for the Relationship Type object in the response. + + Attributes: + label (str): Label of relationship type. + value (str): URL to retrieve the metadata associated with each relationship type. + """ + label: str = None + value: str = None + + @dataclass + class Template(VaultModel): + """ + Model for the Template object in the response. + + Attributes: + label (str): Label of template. + name (str): Name of template. + kind (str): Kind of template. + definedIn (str): Defined in. + definedInType (str): Defined in type. + """ + + label: str = None + name: str = None + kind: str = None + definedIn: str = None + definedInType: str = None + + @dataclass + class Lifecycle(VaultModel): + """ + Model for the Lifecycle object in the response. + + Attributes: + name (str): Name of lifecycle. + label (str): Label of lifecycle. + + """ + name: str = None + label: str = None + + @dataclass + class Subtype(VaultModel): + """ + Model for the Subype object in the response. + + Attributes: + label (str): Label of subtype. + value (str): URL to retrieve the metadata associated with each subtype. + """ + label: str = None + value: str = None + + @dataclass + class Classification(VaultModel): + """ + Model for the Classification object in the response. + + Attributes: + label (str): Label of subtype. + value (str): URL to retrieve the metadata associated with each subtype. + """ + label: str = None + value: str = None + + +@dataclass +class DocumentsResponse(VaultResponse): + """ + Model for the following API calls responses: + + Retrieve All Documents + + Attributes: + documents (List[DocumentNode]): The list of document nodes. + + Vault API Endpoint: + GET /api/{version}/objects/documents + + Vault API Documentation: + [https://developer.veevavault.com/api/24.1/#retrieve-all-documents](https://developer.veevavault.com/api/24.1/#retrieve-all-documents) + """ + documents: List[DocumentNode] = None + + @dataclass + class DocumentNode(VaultModel): + """ + Model for the Document Node object in the response. + + Attributes: + document (Document): The document object. + """ + + document: Document = None + + +@dataclass +class DocumentResponse(VaultResponse): + """ + Model for the following API calls responses: + + Retrieve Document
+ Retrieve Document Version + + Attributes: + document (Document): The document object. + renditions (List[str]): List of all rendition types available. + versions (List[Version]): List of all versions available. + attachments (List[Attachment]): List of all attachments available. + id (int): ID of the document. (Only returned for create, delete, and update) + external_id__v (str): External ID. (Only returned for create, delete, and update) + major_version_number__v (int): Major version number. (Only returned for create, delete, and update) + minor_version_number__v (int): Minor version number. (Only returned for create, delete, and update) + + Vault API Endpoint: + GET /api/{version}/objects/documents/{doc_id} + + Vault API Documentation: + [https://developer.veevavault.com/api/24.1/#retrieve-document](https://developer.veevavault.com/api/24.1/#retrieve-document) + """ + document: Document = None + renditions: Renditions = Field(default_factory=list) + versions: List[Version] = Field(default_factory=list) + attachments: List[Attachment] = Field(default_factory=list) + + # ------------------------------------------------------------ + # Special case: when creating, deleting, and updating docs, + # they do not return a document node. The id, major/minor, + # and external_id__v are at the root. + + id: int = None + external_id__v: str = None + major_version_number__v: int = None + minor_version_number__v: int = None + + @dataclass + class Attachment(VaultModel): + """ + Model for the Attachment object in the response. + + Attributes: + id (str): ID of attachment. + url (str): URL to retrieve the metadata associated with each attachment. + """ + + id: str = None + url: str = None + + +@dataclass +class DocumentVersionsResponse(VaultResponse): + """ + Model for the following API calls responses: + + Retrieve Document Versions + + Attributes: + versions (List[Version]): List of all versions available. + renditions (Renditions): Renditions object that contains available renditions. + + Vault API Endpoint: + GET /api/{version}/objects/documents/{doc_id}/versions + + Vault API Documentation: + [https://developer.veevavault.com/api/24.1/#retrieve-document-versions](https://developer.veevavault.com/api/24.1/#retrieve-document-versions) + """ + versions: List[Version] = None + renditions: Renditions = Field(default_factory=list) + + +@dataclass +class DocumentBulkResponse(VaultResponse): + """ + Model for the following API calls responses: + + Create Multiple Documents + + Attributes: + data (List[DocumentResponse]): List of all document responses. + + Vault API Endpoint: + POST /api/{version}/objects/documents/batch + + Vault API Documentation: + [https://developer.veevavault.com/api/24.1/#create-multiple-documents](https://developer.veevavault.com/api/24.1/#create-multiple-documents) + """ + data: List[DocumentResponse] = Field(default_factory=list) + + def has_errors(self) -> bool: + if super().has_errors(): + return True + + document_responses = self.get_data() + if document_responses is None or len(document_responses) == 0: + return True + else: + for document_response in document_responses: + if document_response.has_errors(): + return True + + return False + + +@dataclass +class DocumentExportResponse(VaultResponse): + """ + Model for the following API calls responses: + + Retrieve Document Export Results + + Attributes: + data (List[ExportedDocument]): List of all exported documents. 
+ + Vault API Endpoint: + GET /api/{version}/objects/documents/batch/actions/fileextract/{jobid}/results + + Vault API Documentation: + [https://developer.veevavault.com/api/24.1/#retrieve-document-export-results](https://developer.veevavault.com/api/24.1/#retrieve-document-export-results) + """ + data: List[ExportedDocument] = Field(default_factory=list) + + @dataclass + class ExportedDocument(VaultModel): + """ + Model for the Exported Document object in the response. + + Attributes: + responseStatus (str): Status of the exported document. + id (int): ID of the document. + major_version_number__v (int): Major version number of the document. + minor_version_number__v (int): Minor version number of the document. + file (str): The path on the file staging server. + user_id__v (int): The id value of the Vault user who initiated the document export job. + """ + + responseStatus: str = None + id: int = None + major_version_number__v: int = None + minor_version_number__v: int = None + file: str = None + user_id__v: int = None + + +@dataclass(config=dict(extra="allow")) +class Renditions(VaultModel): + """ + Model for the Renditions object in the response. + + Attributes: + viewable_rendition__v (str): URL to retrieve the viewable rendition. + """ + + viewable_rendition__v: str = None + + +@dataclass +class Version(VaultModel): + """ + Model for the Version object in the response. + + Attributes: + number (str): Version number. + value (str): URL to retrieve the metadata associated with each version. + """ + + number: str = None + value: str = None diff --git a/common/api/model/response/file_staging_response.py b/common/api/model/response/file_staging_response.py new file mode 100644 index 0000000..dee315e --- /dev/null +++ b/common/api/model/response/file_staging_response.py @@ -0,0 +1,258 @@ +""" +Module that defines classes used to represent responses from the File Staging endpoints. 
+""" +from __future__ import annotations + +from typing import List + +from pydantic import Field +from pydantic.dataclasses import dataclass + +from .vault_response import VaultResponse +from ..vault_model import VaultModel + + +@dataclass +class FileStagingItemResponse(VaultResponse): + """ + Model for the following API calls responses: + + Create Folder or File + + Attributes: + data (FileStagingItem): File staging item + + Vault API Endpoint: + POST /api/{version}/services/file_staging/items + + Vault API Documentation: + [https://developer.veevavault.com/api/24.2/#create-folder-or-file](https://developer.veevavault.com/api/24.2/#create-folder-or-file) + """ + + data: FileStagingItem = None + + +@dataclass +class FileStagingItemBulkResponse(VaultResponse): + """ + Model for the following API calls responses: + + List Items at a Path + + Attributes: + data (List[FileStagingItem]): List of file staging items + responseDetails (ResponseDetails): Response details + + Vault API Endpoint: + GET /api/{version}/services/file_staging/items/{item} + + Vault API Documentation: + [https://developer.veevavault.com/api/24.2/#list-items-at-a-path](https://developer.veevavault.com/api/24.2/#list-items-at-a-path) + """ + + data: List[FileStagingItem] = Field(default_factory=list) + responseDetails: ResponseDetails = Field(default=None) + + def is_paginated(self) -> bool: + """ + Check if response is paginated + + Returns: + bool: True if there is a next page of results + """ + + if self.responseDetails is not None and self.responseDetails.next_page is not None: + return True + + return False + + +@dataclass +class FileStagingSessionResponse(VaultResponse): + """ + Model for the following API calls responses: + + Create Resumable Upload Session
+ Get Upload Session Details + + Attributes: + data (ResumableUploadSession): Upload session + + Vault API Endpoint: + POST /api/{version}/services/file_staging/upload
+ GET /api/{version}/services/file_staging/upload/{upload_session_id} + + Vault API Documentation: + [https://developer.veevavault.com/api/24.2/#create-resumable-upload-session](https://developer.veevavault.com/api/24.2/#create-resumable-upload-session)
+ [https://developer.veevavault.com/api/24.2/#get-upload-session-details](https://developer.veevavault.com/api/24.2/#get-upload-session-details) + """ + + data: ResumableUploadSession = None + + +@dataclass +class FileStagingSessionBulkResponse(VaultResponse): + """ + Model for the following API calls responses: + + List Upload Sessions + + Attributes: + data (List[ResumableUploadSession]): List of upload sessions + responseDetails (ResponseDetails): Response details + + Vault API Endpoint: + GET /api/{version}/services/file_staging/upload + + Vault API Documentation: + [https://developer.veevavault.com/api/24.2/#list-upload-sessions](https://developer.veevavault.com/api/24.2/#list-upload-sessions) + """ + + data: List[ResumableUploadSession] = Field(default_factory=list) + responseDetails: ResponseDetails = None + + def is_paginated(self) -> bool: + """ + Check if response is paginated + + Returns: + bool: True if there is a next page of results + """ + + if self.responseDetails is not None and self.responseDetails.next_page is not None: + return True + + return False + + +@dataclass +class FileStagingSessionPartResponse(VaultResponse): + """ + Model for the following API calls responses: + + Upload to a Session + + Attributes: + data (ResumableUploadSessionPart): Upload session part + + Vault API Endpoint: + PUT /api/{version}/services/file_staging/upload/{upload_session_id} + + Vault API Documentation: + [https://developer.veevavault.com/api/24.2/#upload-to-a-session](https://developer.veevavault.com/api/24.2/#upload-to-a-session) + """ + + data: ResumableUploadSessionPart = None + + +@dataclass +class FileStagingSessionPartBulkResponse(VaultResponse): + """ + Model for the following API calls responses: + + List File Parts Uploaded to Session + + Attributes: + data (List[ResumableUploadSessionPart]): List of uploaded parts + + Vault API Endpoint: + GET /api/{version}/services/file_staging/upload/{upload_session_id}/parts + + Vault API Documentation: + [https://developer.veevavault.com/api/24.2/#list-file-parts-uploaded-to-session](https://developer.veevavault.com/api/24.2/#list-file-parts-uploaded-to-session) + """ + + data: List[ResumableUploadSessionPart] = Field(default_factory=list) + + +@dataclass +class FileStagingJobResponse(VaultResponse): + """ + Model for the following API calls responses: + + Update Folder or File
+ Delete Folder or File
+ Commit Upload Session + + Attributes: + data (Job): Job + + Vault API Endpoint: + PUT /api/{version}/services/file_staging/items/{item}
+ DELETE /api/{version}/services/file_staging/items/{item}
+ POST /api/{version}/services/file_staging/upload/{upload_session_id} + + Vault API Documentation: + [https://developer.veevavault.com/api/24.2/#update-folder-or-file](https://developer.veevavault.com/api/24.2/#update-folder-or-file)
+ [https://developer.veevavault.com/api/24.2/#delete-file-or-folder](https://developer.veevavault.com/api/24.2/#delete-file-or-folder)
+ [https://developer.veevavault.com/api/24.2/#commit-upload-session](https://developer.veevavault.com/api/24.2/#commit-upload-session) + """ + + data: Job = None + + @dataclass + class Job(VaultModel): + """ + Model for the data object in the response + + Attributes: + job_id (int): Job ID + url (str): URL of the job + """ + + job_id: int = None + url: str = None + + +@dataclass +class ResponseDetails(VaultModel): + """ + Model for the response details object in the response. + + Attributes: + next_page (str): The next page of results + """ + + next_page: str = None + + +@dataclass +class FileStagingItem(VaultModel): + """ + Model for the data objects in the response + + Attributes: + kind (str): file/folder + path (str): Path of the file/folder + name (str): Name of the file/folder + size (int): Size of the file + modified_date (str): Modified date of the file + """ + + kind: str = None + path: str = None + name: str = None + size: int = None + modified_date: str = None + + +@dataclass +class ResumableUploadSession(VaultModel): + created_date: str = None + expiration_date: str = None + owner: int = None + id: str = None + last_uploaded_date: str = None + path: str = None + size: int = None + uploaded_parts: int = None + uploaded: int = None + name: str = None + overwrite: bool = None + + +@dataclass +class ResumableUploadSessionPart(VaultModel): + part_number: int = None + size: int = None + part_content_md5: str = None diff --git a/common/api/model/response/jobs_response.py b/common/api/model/response/jobs_response.py new file mode 100644 index 0000000..f176d20 --- /dev/null +++ b/common/api/model/response/jobs_response.py @@ -0,0 +1,51 @@ +""" +Module that defines classes used to represent responses from the Jobs endpoints. +""" +from __future__ import annotations + +from pydantic.dataclasses import dataclass + +from .vault_response import VaultResponse +from ..component.job import Job + + +@dataclass +class JobStatusResponse(VaultResponse): + """ + Model for the following API calls responses: + + Retrieve Job Status + + Attributes: + data (Job): Job + + Vault API Endpoint: + GET /api/{version}/services/jobs/{job_id} + + Vault API Documentation: + [https://developer.veevavault.com/api/24.2/#retrieve-job-status](https://developer.veevavault.com/api/24.2/#retrieve-job-status) + """ + + data: Job = None + + +@dataclass +class JobCreateResponse(VaultResponse): + """ + Model for the following API calls responses: + + Export Documents + + Attributes: + url (str): URL to retrieve the current job status of the document export request. + job_id (int): The Job ID value to retrieve the status and results of the document export request. + + Vault API Endpoint: + POST /api/{version}/objects/documents/batch/actions/fileextract + + Vault API Documentation: + [https://developer.veevavault.com/api/24.1/#export-documents-1](https://developer.veevavault.com/api/24.1/#export-documents-1) + """ + + url: str = None + job_id: int = None diff --git a/common/api/request/document_request.py b/common/api/request/document_request.py new file mode 100644 index 0000000..a366e1f --- /dev/null +++ b/common/api/request/document_request.py @@ -0,0 +1,927 @@ +""" +Module that defines classes used to send Document requests to the Vault API. 
+""" +from enum import Enum +from typing import Set + +from ..connector import http_request_connector +from ..connector.http_request_connector import HttpMethod +from ..model.component.document import Document +from ..model.response.document_response import DocumentFieldResponse, DocumentsResponse, DocumentResponse, \ + DocumentExportResponse, DocumentBulkResponse +from ..model.response.document_response import DocumentTypeHeirarchyResponse +from ..model.response.document_response import DocumentTypesResponse +from ..model.response.document_response import DocumentVersionsResponse +from ..model.response.jobs_response import JobCreateResponse +from ..model.response.vault_response import VaultResponse +from ..request.vault_request import VaultRequest, _ResponseOption, _RequestOption + + +class NamedFilter(Enum): + """ + Enumeration class representing Retrieve all document named filters. + + Attributes: + CART (str): Retrieves only documents in your cart. + FAVORITES (str): Retrieves only documents which you have marked as favorites in the library. + RECENT_DOCUMENTS (str): Retrieves only documents which you have recently accessed. + MY_DOCUMENTS (str): Retrieves only documents which you have created. + """ + + CART: str = 'Cart' + FAVORITES: str = 'Favorites' + RECENT_DOCUMENTS: str = 'Recent Documents' + MY_DOCUMENTS: str = 'My Documents' + + +class Scope(Enum): + """ + Enumeration class representing Retrieve all document scopes. + + Attributes: + ALL (str): Searches both within the document content and searchable document fields. + CONTENTS (str): Searches only within the document content. + """ + + ALL: str = 'all' + CONTENTS: str = 'contents' + + +class VersionsScope(Enum): + """ + Enumeration class representing Retrieve all document versions scopes. + + Attributes: + ALL (str): Retrieves all document versions, rather than only the latest version. + """ + + ALL: str = 'all' + + +class DocumentRequest(VaultRequest): + """ + Class that defines methods used to call Documents endpoints. 
+ + Vault API Documentation: + [https://developer.veevavault.com/api/24.1/#documents](https://developer.veevavault.com/api/24.1/#documents) + """ + + _URL_DOC_ALL_FIELDS: str = '/metadata/objects/documents/properties' + _URL_DOC_COMMON_FIELDS: str = '/metadata/objects/documents/properties/find_common' + _URL_DOC_TYPES: str = '/metadata/objects/documents/types' + _URL_DOC_TYPE: str = '/metadata/objects/documents/types/{type}' + _URL_DOC_SUBTYPE: str = '/metadata/objects/documents/types/{type}/subtypes/{subtype}' + _URL_DOC_CLASSIFICATION: str = '/metadata/objects/documents/types/{type}/subtypes/{subtype}/classifications/{classification}' + _URL_DOCS: str = '/objects/documents' + _URL_DOC: str = '/objects/documents/{doc_id}' + _URL_DOC_VERSIONS: str = '/objects/documents/{doc_id}/versions' + _URL_DOC_VERSION: str = '/objects/documents/{doc_id}/versions/{major_version}/{minor_version}' + _URL_DOC_FILE: str = '/objects/documents/{doc_id}/file' + _URL_DOC_VERSION_FILE: str = '/objects/documents/{doc_id}/versions/{major_version}/{minor_version}/file' + _URL_DOC_VERSION_THUMBNAIL: str = '/objects/documents/{doc_id}/versions/{major_version}/{minor_version}/thumbnail' + _URL_DOCS_BATCH: str = '/objects/documents/batch' + _URL_DOC_EXTRACT: str = '/objects/documents/batch/actions/fileextract' + _URL_DOC_EXTRACT_VERSIONS: str = '/objects/documents/versions/batch/actions/fileextract' + _URL_DOC_EXTRACT_RESULTS: str = '/objects/documents/batch/actions/fileextract/{jobid}/results' + + _HTTP_HEADER_VAULT_MIGRATION_MODE: str = 'X-VaultAPI-MigrationMode' + + def retrieve_all_document_fields(self) -> DocumentFieldResponse: + """ + **Retrieve All Document Fields** + + Retrieve all standard and custom document fields and field properties. + + Returns: + DocumentFieldResponse: Modeled response from Vault + + Vault API Endpoint: + GET /api/{version}/metadata/objects/documents/properties + + Vault API Documentation: + [https://developer.veevavault.com/api/24.1/#retrieve-all-document-fields](https://developer.veevavault.com/api/24.1/#retrieve-all-document-fields) + + Example: + ```python + # Example Request + request: DocumentRequest = vault_client.new_request(DocumentRequest) + response: DocumentFieldResponse = request.retrieve_all_document_fields() + + # Example Response + properties: List[DocumentField] = response.properties + for document_field in properties: + print('-----Document Field-----') + print(f'Field Name: {document_field.name}') + print(f'Field Type: {document_field.type}') + print(f'Field Required: {document_field.required}') + ``` + """ + endpoint = self.get_api_endpoint(endpoint=self._URL_DOC_ALL_FIELDS) + + return self._send(http_method=HttpMethod.GET, + url=endpoint, + response_class=DocumentFieldResponse) + + def retrieve_common_document_fields(self, doc_ids: Set[int]) -> DocumentFieldResponse: + """ + **Retrieve Common Document Fields** + + Retrieve all document fields and field properties which are common to (shared by) a specified set of documents. + This allows you to determine which document fields are eligible for bulk update. 
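+
+        Args:
+            doc_ids (Set[int]): Set of document IDs to check for common fields.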
+ + Returns: + DocumentFieldResponse: Modeled response from Vault + + Vault API Endpoint: + POST /api/{version}/metadata/objects/documents/properties/find_common + + Vault API Documentation: + [https://developer.veevavault.com/api/24.1/#retrieve-common-document-fields](https://developer.veevavault.com/api/24.1/#retrieve-common-document-fields) + + Example: + ```python + # Example Request + request: DocumentRequest = vault_client.new_request(DocumentRequest) + response: DocumentFieldResponse = request.retrieve_common_document_fields(doc_ids=doc_ids) + + # Example Response + properties: List[DocumentField] = response.properties + for document_field in properties: + print('-----Document Field-----') + print(f'Field Name: {document_field.name}') + print(f'Field Type: {document_field.type}') + print(f'Field Required: {document_field.required}') + ``` + """ + endpoint = self.get_api_endpoint(endpoint=self._URL_DOC_COMMON_FIELDS) + self._add_header_param(http_request_connector.HTTP_HEADER_CONTENT_TYPE, + http_request_connector.HTTP_CONTENT_TYPE_XFORM) + + doc_ids_str: str = ",".join(map(str, doc_ids)) + self._add_body_param('docIds', doc_ids_str) + return self._send(http_method=HttpMethod.POST, + url=endpoint, + response_class=DocumentFieldResponse) + + def retrieve_all_document_types(self) -> DocumentTypesResponse: + """ + **Retrieve All Document Types** + + Retrieve all document types. These are the top-level of the document type/subtype/classification hierarchy. + + Returns: + DocumentTypesResponse: Modeled response from Vault + + Vault API Endpoint: + GET /api/{version}/metadata/objects/documents/types + + Vault API Documentation: + [https://developer.veevavault.com/api/24.1/#retrieve-all-document-types](https://developer.veevavault.com/api/24.1/#retrieve-all-document-types) + + Example: + ```python + # Example Request + request: DocumentRequest = vault_client.new_request(DocumentRequest) + response: DocumentTypesResponse = request.retrieve_all_document_types() + + # Example Response + types: List[DocumentTypesResponse.DocumentType] = response.types + for document_type in types: + print('-----Document Type-----') + print(f'Label: {document_type.label}') + print(f'Value: {document_type.value}') + ``` + """ + endpoint = self.get_api_endpoint(endpoint=self._URL_DOC_TYPES) + + return self._send(http_method=HttpMethod.GET, + url=endpoint, + response_class=DocumentTypesResponse) + + def retrieve_document_type(self, type: str) -> DocumentTypeHeirarchyResponse: + """ + **Retrieve Document Type** + + Retrieve all metadata from a document type, including all of its subtypes (when available). 
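+
+        Args:
+            type (str): The document type.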
+ + Returns: + DocumentTypeHeirarchyResponse: Modeled response from Vault + + Vault API Endpoint: + GET /api/{version}/metadata/objects/documents/types/{type} + + Vault API Documentation: + [https://developer.veevavault.com/api/24.1/#retrieve-document-type](https://developer.veevavault.com/api/24.1/#retrieve-document-type) + + Example: + ```python + # Example Request + request: DocumentRequest = vault_client.new_request(DocumentRequest) + response: DocumentTypeHeirarchyResponse = request.retrieve_document_type(type=doc_type) + + # Example Response + print(f'Name: {response.name}') + print(f'Label: {response.label}') + + properties: List[DocumentField] = response.properties + for document_field in properties: + print('-----Document Field-----') + print(f'Field Name: {document_field.name}') + print(f'Field Type: {document_field.type}') + + subtypes: List[DocumentTypeResponse.DocumentSubType] = response.subtypes + for document_subtype in subtypes: + print('-----Document Subtype-----') + print(f'Label: {document_subtype.label}') + print(f'Value: {document_subtype.value}') + ``` + """ + endpoint = self.get_api_endpoint(endpoint=self._URL_DOC_TYPE) + endpoint = endpoint.replace('{type}', type) + + return self._send(http_method=HttpMethod.GET, + url=endpoint, + response_class=DocumentTypeHeirarchyResponse) + + def retrieve_document_subtype(self, type: str, subtype: str) -> DocumentTypeHeirarchyResponse: + """ + **Retrieve Document Subtype** + + Retrieve all metadata from a document subtype, including all of its classifications (when available). + + Returns: + DocumentTypeHeirarchyResponse: Modeled response from Vault + + Vault API Endpoint: + GET /api/{version}/metadata/objects/documents/types/{type}/subtypes/{subtype} + + Vault API Documentation: + [https://developer.veevavault.com/api/24.1/#retrieve-document-subtype](https://developer.veevavault.com/api/24.1/#retrieve-document-subtype) + + Example: + ```python + # Example Request + request: DocumentRequest = vault_client.new_request(DocumentRequest) + response: DocumentTypeHeirarchyResponse = request.retrieve_document_subtype(type=doc_type, subtype=doc_subtype) + + # Example Response + print(f'Name: {response.name}') + print(f'Label: {response.label}') + + properties: List[DocumentField] = response.properties + for document_field in properties: + print('-----Document Field-----') + print(f'Field Name: {document_field.name}') + print(f'Field Type: {document_field.type}') + + classifications: List[DocumentTypeResponse.DocumentSubType] = response.classifications + for classification in classifications: + print('-----Document Classification-----') + print(f'Label: {classification.label}') + print(f'Value: {classification.value}') + ``` + """ + endpoint = self.get_api_endpoint(endpoint=self._URL_DOC_SUBTYPE) + endpoint = endpoint.replace('{type}', type) + endpoint = endpoint.replace('{subtype}', subtype) + + return self._send(http_method=HttpMethod.GET, + url=endpoint, + response_class=DocumentTypeHeirarchyResponse) + + def retrieve_document_classification(self, type: str, subtype: str, + classification: str) -> DocumentTypeHeirarchyResponse: + """ + **Retrieve Document Classification** + + Retrieve all metadata from a document classification. 
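+
+        Args:
+            type (str): The document type.
+            subtype (str): The document subtype.
+            classification (str): The document classification.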
+ + Returns: + DocumentTypeHeirarchyResponse: Modeled response from Vault + + Vault API Endpoint: + GET /api/{version}/metadata/objects/documents/types/{type}/subtypes/{subtype}/classifications/{classification} + + Vault API Documentation: + [https://developer.veevavault.com/api/24.1/#retrieve-document-classification](https://developer.veevavault.com/api/24.1/#retrieve-document-classification) + + Example: + ```python + # Example Request + request: DocumentRequest = vault_client.new_request(DocumentRequest) + response = request.retrieve_document_classification(type=doc_type, subtype=doc_subtype, classification=doc_classification) + + # Example Response + print(f'Name: {response.name}') + print(f'Label: {response.label}') + + properties: List[DocumentField] = response.properties + for document_field in properties: + print('-----Document Field-----') + print(f'Field Name: {document_field.name}') + print(f'Field Type: {document_field.type}') + ``` + """ + endpoint = self.get_api_endpoint(endpoint=self._URL_DOC_CLASSIFICATION) + endpoint = endpoint.replace('{type}', type) + endpoint = endpoint.replace('{subtype}', subtype) + endpoint = endpoint.replace('{classification}', classification) + + return self._send(http_method=HttpMethod.GET, + url=endpoint, + response_class=DocumentTypeHeirarchyResponse) + + def retrieve_all_documents(self, named_filter: NamedFilter = None, + scope: Scope = None, + versionscope: VersionsScope = None, + search_keyword: str = None, + limit: int = None, + sort: str = None, + start: int = None) -> DocumentsResponse: + """ + **Retrieve All Documents** + + Retrieve the latest version of documents and binders to which you have access. + + Returns: + DocumentsResponse: Modeled response from Vault + + Vault API Endpoint: + GET /api/{version}/objects/documents + + Vault API Documentation: + [https://developer.veevavault.com/api/24.1/#retrieve-all-documents](https://developer.veevavault.com/api/24.1/#retrieve-all-documents) + + Example: + ```python + # Example Request + request: DocumentRequest = vault_client.new_request(DocumentRequest) + response: DocumentsResponse = request.retrieve_all_documents() + + # Example Response + documents: List[DocumentsResponse.DocumentNode] = response.documents + for documentNode in documents: + document: Document = documentNode.document + print(f'Document ID: {document.id})') + print(f'Document Name: {document.name__v})') + print(f'Major Version Number: {document.major_version_number__v})') + print(f'Minor Version Number: {document.minor_version_number__v})') + ``` + """ + endpoint = self.get_api_endpoint(endpoint=self._URL_DOCS) + + if named_filter is not None: + self._add_query_param('named_filter', named_filter.value) + + if scope is not None: + self._add_query_param('scope', scope.value) + + if versionscope is not None: + self._add_query_param('versionscope', versionscope.value) + + if search_keyword is not None: + self._add_query_param('search_keyword', search_keyword) + + if limit is not None: + self._add_query_param('limit', limit) + + if sort is not None: + self._add_query_param('sort', sort) + + if start is not None: + self._add_query_param('start', start) + + return self._send(http_method=HttpMethod.GET, + url=endpoint, + response_class=DocumentsResponse) + + def retrieve_document(self, doc_id: int) -> DocumentResponse: + """ + **Retrieve Document** + + Retrieve all metadata from a document. 
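+
+        Args:
+            doc_id (int): The document ID.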
+ + Returns: + DocumentResponse: Modeled response from Vault + + Vault API Endpoint: + GET /api/{version}/objects/documents/{doc_id} + + Vault API Documentation: + [https://developer.veevavault.com/api/24.1/#retrieve-document](https://developer.veevavault.com/api/24.1/#retrieve-document) + + Example: + ```python + # Example Request + request: DocumentRequest = get_vault_client.new_request(DocumentRequest) + response: DocumentResponse = request.retrieve_document(doc_id=doc_id) + + # Example Response + document: Document = response.document + print(f'Document ID: {document.id}') + print(f'Document Name: {document.name__v}') + print(f'Major Version Number: {document.major_version_number__v})') + print(f'Minor Version Number: {document.minor_version_number__v})') + ``` + """ + endpoint = self.get_api_endpoint(endpoint=self._URL_DOC) + endpoint = endpoint.replace('{doc_id}', str(doc_id)) + + return self._send(http_method=HttpMethod.GET, + url=endpoint, + response_class=DocumentResponse) + + def retrieve_document_versions(self, doc_id: int) -> DocumentVersionsResponse: + """ + **Retrieve Document Versions** + + Retrieve all versions of a document. + + Returns: + DocumentVersionsResponse: Modeled response from Vault + + Vault API Endpoint: + GET /api/{version}/objects/documents/{doc_id}/versions + + Vault API Documentation: + [https://developer.veevavault.com/api/24.1/#retrieve-document-versions](https://developer.veevavault.com/api/24.1/#retrieve-document-versions) + + Example: + ```python + # Example Request + request: DocumentRequest = vault_client.new_request(DocumentRequest) + response: DocumentVersionsResponse = request.retrieve_document_versions(doc_id=doc_id) + + # Example Response + for version in response.versions: + print('-----Version-----') + print(f'Version Number: {version.number}') + print(f'URL: {version.value}') + ``` + """ + endpoint = self.get_api_endpoint(endpoint=self._URL_DOC_VERSIONS) + endpoint = endpoint.replace('{doc_id}', str(doc_id)) + + return self._send(http_method=HttpMethod.GET, + url=endpoint, + response_class=DocumentVersionsResponse) + + def retrieve_document_version(self, doc_id: int, + major_version: int, + minor_version: int) -> DocumentResponse: + """ + **Retrieve Document Version** + + Retrieve all fields and values configured on a document version. + + Args: + doc_id (int): The document ID. + major_version (int): The major version number. + minor_version (int): The minor version number. 
+ + Returns: + DocumentResponse: Modeled response from Vault + + Vault API Endpoint: + GET /api/{version}/objects/documents/{doc_id}/versions/{major_version}/{minor_version} + + Vault API Documentation: + [https://developer.veevavault.com/api/24.1/#retrieve-document-version](https://developer.veevavault.com/api/24.1/#retrieve-document-version) + + Example: + ```python + # Example Request + request: DocumentRequest = vault_client.new_request(DocumentRequest) + response: DocumentResponse = request.retrieve_document_version( + doc_id=doc_id, + major_version=major_version, + minor_version=minor_version) + + # Example Response + document: Document = response.document + print(f'Document ID: {document.id}') + print(f'Document Name: {document.name__v}') + print(f'Major Version Number: {document.major_version_number__v})') + print(f'Minor Version Number: {document.minor_version_number__v})') + ``` + """ + endpoint = self.get_api_endpoint(endpoint=self._URL_DOC_VERSION) + endpoint = endpoint.replace('{doc_id}', str(doc_id)) + endpoint = endpoint.replace('{major_version}', str(major_version)) + endpoint = endpoint.replace('{minor_version}', str(minor_version)) + + return self._send(http_method=HttpMethod.GET, + url=endpoint, + response_class=DocumentResponse) + + def download_document_file(self, doc_id: int, lock_document: bool = None) -> VaultResponse: + """ + **Download Document File** + + Download the latest version of the source file from the document. + + Args: + doc_id (int): The document ID. + lock_document (bool): Set to true to Check Out this document before retrieval. + + Returns: + VaultResponse: Modeled response from Vault + + Vault API Endpoint: + GET /api/{version}/objects/documents/{doc_id}/file + + Vault API Documentation: + [https://developer.veevavault.com/api/24.1/#download-document-file](https://developer.veevavault.com/api/24.1/#download-document-file) + + Example: + ```python + # Example Request + request: DocumentRequest = vault_client.new_request(request_class=DocumentRequest) + response: VaultResponse = request.download_document_file(doc_id=doc_id) + + # Example Response + print(f'File Name: {response.headers.get("Content-Disposition")}') + print(f'Size: {len(response.binary_content)}') + ``` + """ + endpoint = self.get_api_endpoint(endpoint=self._URL_DOC_FILE) + endpoint = endpoint.replace('{doc_id}', str(doc_id)) + + if lock_document is not None: + self._add_query_param('lockDocument', lock_document) + + self._response_option = _ResponseOption.BYTES + + return self._send(http_method=HttpMethod.GET, + url=endpoint, + response_class=VaultResponse) + + def download_document_version_file(self, doc_id: int, + major_version: int, + minor_version: int) -> VaultResponse: + """ + **Download Document Version File** + + Download the file of a specific document version. + + Args: + doc_id (int): The document ID. + major_version (int): The major version number. + minor_version (int): The minor version number. 
+ + Returns: + VaultResponse: Modeled response from Vault + + Vault API Endpoint: + GET /api/{version}/objects/documents/{doc_id}/versions/{major_version}/{minor_version}/file + + Vault API Documentation: + [https://developer.veevavault.com/api/24.1/#download-document-version-file](https://developer.veevavault.com/api/24.1/#download-document-version-file) + + Example: + ```python + # Example Request + request: DocumentRequest = vault_client.new_request(request_class=DocumentRequest) + response: VaultResponse = request.download_document_version_file(doc_id=doc_id, + major_version=major_version, + minor_version=minor_version) + + # Example Response + print(f'File Name: {response.headers.get("Content-Disposition")}') + print(f'Size: {len(response.binary_content)}') + ``` + """ + endpoint = self.get_api_endpoint(endpoint=self._URL_DOC_VERSION_FILE) + endpoint = endpoint.replace('{doc_id}', str(doc_id)) + endpoint = endpoint.replace('{major_version}', str(major_version)) + endpoint = endpoint.replace('{minor_version}', str(minor_version)) + + self._response_option = _ResponseOption.BYTES + + return self._send(http_method=HttpMethod.GET, + url=endpoint, + response_class=VaultResponse) + + def download_document_version_thumbnail_file(self, doc_id: int, + major_version: int, + minor_version: int) -> VaultResponse: + """ + **Download Document Version Thumbnail File** + + Download the thumbnail image file of a specific document version. + + Args: + doc_id (int): The document ID. + major_version (int): The major version number. + minor_version (int): The minor version number. + + Returns: + VaultResponse: Modeled response from Vault + + Vault API Endpoint: + GET /api/{version}/objects/documents/{doc_id}/versions/{major_version}/{minor_version}/thumbnail + + Vault API Documentation: + [https://developer.veevavault.com/api/24.1/#download-document-version-thumbnail-file](https://developer.veevavault.com/api/24.1/#download-document-version-thumbnail-file) + + Example: + ```python + # Example Request + request: DocumentRequest = vault_client.new_request(request_class=DocumentRequest) + response: VaultResponse = request.download_document_version_thumbnail_file(doc_id=doc_id, + major_version=major_version, + minor_version=minor_version) + + # Example Response + print(f'Size: {len(response.binary_content)}') + ``` + """ + endpoint = self.get_api_endpoint(endpoint=self._URL_DOC_VERSION_THUMBNAIL) + endpoint = endpoint.replace('{doc_id}', str(doc_id)) + endpoint = endpoint.replace('{major_version}', str(major_version)) + endpoint = endpoint.replace('{minor_version}', str(minor_version)) + + self._response_option = _ResponseOption.BYTES + + return self._send(http_method=HttpMethod.GET, + url=endpoint, + response_class=VaultResponse) + + def create_single_document(self, doc: Document) -> DocumentResponse: + """ + **Create Single Document** + + Create a single document. + + Args: + doc (Document): The document object. 
+ + Returns: + DocumentResponse: Modeled response from Vault + + Vault API Endpoint: + POST /api/{version}/objects/documents + + Vault API Documentation: + [https://developer.veevavault.com/api/24.1/#create-single-document](https://developer.veevavault.com/api/24.1/#create-single-document) + + Example: + ```python + # Example Request + document: Document = Document() + document.name__v = 'Test Document' + document.type__v = 'VAPIL Test Doc Type' + document.subtype__v = 'VAPIL Test Doc Subtype' + document.classification__v = 'VAPIL Test Doc Classification' + document.lifecycle__v = 'VAPIL Test Doc Lifecycle' + + request: DocumentRequest = vault_client.new_request(request_class=DocumentRequest) + response: DocumentResponse = request.create_single_document(doc=document) + + # Example Response + print(f'Document ID: {response.id}') + ``` + """ + endpoint = self.get_api_endpoint(endpoint=self._URL_DOCS) + + self._body_params = doc.__dict__ + return self._send(http_method=HttpMethod.POST, + url=endpoint, + response_class=DocumentResponse) + + def create_multiple_documents(self, input_path: str = None, + request_string: str = None, + migration_mode: bool = False) -> DocumentBulkResponse: + """ + **Create Multiple Documents** + + This endpoint allows you to create multiple documents at once with a CSV input file. + + Args: + input_path (str): The path to the input CSV file. + request_string (str): The csv request string. + migration_mode (bool): Set to true to enable migration mode. + + Returns: + DocumentBulkResponse: Modeled response from Vault + + Vault API Endpoint: + POST /api/{version}/objects/documents/batch + + Vault API Documentation: + [https://developer.veevavault.com/api/24.1/#create-multiple-documents](https://developer.veevavault.com/api/24.1/#create-multiple-documents) + + Example: + ```python + # Example Request + request: DocumentRequest = vault_client.new_request(request_class=DocumentRequest) + response: DocumentBulkResponse = request.create_multiple_documents(input_path=csv_path) + + # Example Response + data: List[DocumentResponse] = response.data + for document_response in data: + print(f'Response Status: {document_response.responseStatus}') + print(f'Document ID: {document_response.id}') + ``` + """ + endpoint = self.get_api_endpoint(endpoint=self._URL_DOCS_BATCH) + + self._add_header_param(http_request_connector.HTTP_HEADER_CONTENT_TYPE, + http_request_connector.HTTP_CONTENT_TYPE_CSV) + + if migration_mode: + self._add_query_param(self._HTTP_HEADER_VAULT_MIGRATION_MODE, migration_mode) + + if input_path is not None: + content = None + with open(input_path, 'r', encoding='utf-8') as file: + content = file.read() + self._body_params = content + + if request_string: + self._add_raw_string(request_string) + + return self._send(http_method=HttpMethod.POST, + url=endpoint, + response_class=DocumentBulkResponse) + + def update_single_document(self, doc: Document) -> DocumentResponse: + """ + **Update Single Document** + + Update editable field values on the latest version of a single document. + + Args: + doc (Document): The document object. 
+ + Returns: + DocumentResponse: Modeled response from Vault + + Vault API Endpoint: + PUT /api/{version}/objects/documents/{doc_id} + + Vault API Documentation: + [https://developer.veevavault.com/api/24.2/#update-single-document](https://developer.veevavault.com/api/24.2/#update-single-document) + + Example: + ```python + # Example Request + document: Document = Document() + document.id = doc_id + document.name__v = 'Test Document Update' + + request: DocumentRequest = get_vault_client.new_request(request_class=DocumentRequest) + response: DocumentResponse = request.update_single_document(doc=document) + + # Example Response + print(f'Document ID: {response.id}') + ``` + """ + endpoint = self.get_api_endpoint(endpoint=self._URL_DOC) + endpoint = endpoint.replace('{doc_id}', str(doc.id)) + + self._body_params = doc.__dict__ + return self._send(http_method=HttpMethod.PUT, + url=endpoint, + response_class=DocumentResponse) + + def export_documents(self, request_string: str = None, + include_source: bool = True, + include_renditions: bool = False, + include_allversions: bool = False) -> JobCreateResponse: + """ + **Export Documents** + + Use this request to export a set of documents to your Vault’s file staging server. + + Args: + request_string (str): The json request string. + include_source (bool): Include the source file. If omitted, defaults to true. + include_renditions (bool): Include renditions. If omitted, defaults to false. + include_allversions (bool): Include all versions. If omitted, defaults to false. + + Returns: + JobCreateResponse: Modeled response from Vault + + Vault API Endpoint: + POST /api/{version}/objects/documents/batch/actions/fileextract + + Vault API Documentation: + [https://developer.veevavault.com/api/24.1/#export-documents-1](https://developer.veevavault.com/api/24.1/#export-documents-1) + + Example: + ```python + # Example Request + request: DocumentRequest = get_vault_client.new_request(request_class=DocumentRequest) + response: JobCreateResponse = request.export_documents(request_string=json_string) + + # Example Response + print(f'Job ID: {response.job_id}') + print(f'URL: {response.url}') + ``` + """ + endpoint = self.get_api_endpoint(endpoint=self._URL_DOC_EXTRACT) + self._add_body_param('source', include_source) + self._add_body_param('renditions', include_renditions) + self._add_body_param('allversions', include_allversions) + + self._add_header_param(http_request_connector.HTTP_HEADER_CONTENT_TYPE, + http_request_connector.HTTP_CONTENT_TYPE_JSON) + + if request_string: + self._add_raw_string(request_string) + + return self._send(http_method=HttpMethod.POST, + url=endpoint, + response_class=JobCreateResponse) + + def export_document_versions(self, request_string: str = None, + include_source: bool = True, + include_renditions: bool = False) -> JobCreateResponse: + """ + **Export Document Versions** + + Export a specific set of document versions to your Vault’s file staging server. The files you export go to the u{userID} folder, regardless of your security profile. + + Args: + request_string (str): The json request string. + include_source (bool): Include the source file. If omitted, defaults to true. + include_renditions (bool): Include renditions. If omitted, defaults to false. 
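+
+        A hedged illustration of building `request_string`: the payload shape below is an
+        assumption inferred from the ExportedDocument response model and should be confirmed
+        against the Vault API documentation linked below.
+
+        ```python
+        import json
+
+        # Hypothetical payload: one entry per document version to export
+        request_string = json.dumps([
+            {'id': 23, 'major_version_number__v': 0, 'minor_version_number__v': 1}
+        ])
+        ```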
+ + Returns: + JobCreateResponse: Modeled response from Vault + + Vault API Endpoint: + POST /api/{version}/objects/documents/versions/batch/actions/fileextract + + Vault API Documentation: + [https://developer.veevavault.com/api/24.1/#export-document-versions](https://developer.veevavault.com/api/24.1/#export-document-versions) + + Example: + ```python + # Example Request + request: DocumentRequest = get_vault_client.new_request(request_class=DocumentRequest) + response: JobCreateResponse = request.export_document_versions(request_string=json_string) + + # Example Response + print(f'Job ID: {response.job_id}') + print(f'URL: {response.url}') + ``` + """ + endpoint = self.get_api_endpoint(endpoint=self._URL_DOC_EXTRACT) + self._add_body_param('source', include_source) + self._add_body_param('renditions', include_renditions) + + self._add_header_param(http_request_connector.HTTP_HEADER_CONTENT_TYPE, + http_request_connector.HTTP_CONTENT_TYPE_JSON) + + if request_string: + self._add_raw_string(request_string) + + return self._send(http_method=HttpMethod.POST, + url=endpoint, + response_class=JobCreateResponse) + + def retrieve_document_export_results(self, job_id: int) -> DocumentExportResponse: + """ + **Retrieve Document Export Results** + + After submitting a request to export documents from your Vault, you can query your Vault to determine the results of the request. + + Args: + job_id (int): The job ID. + + Returns: + DocumentExportResponse: Modeled response from Vault + + Vault API Endpoint: + GET /api/{version}/objects/documents/batch/actions/fileextract/{jobid}/results + + Vault API Documentation: + [https://developer.veevavault.com/api/24.1/#retrieve-document-export-results](https://developer.veevavault.com/api/24.1/#retrieve-document-export-results) + + Example: + ```python + # Example Request + request: DocumentRequest = get_vault_client.new_request(request_class=DocumentRequest) + response: DocumentExportResponse = request.retrieve_document_export_results(job_id=job_id) + + # Example Response + data: List[DocumentExportResponse.ExportedDocument] = response.data + for exported_document in data: + print('-----Exported Document-----') + print(f'Response Status: {exported_document.responseStatus}') + print(f'ID: {exported_document.id}') + print(f'Major Version Number: {exported_document.major_version_number__v}') + print(f'Minor Version Number: {exported_document.minor_version_number__v}') + ``` + """ + endpoint = self.get_api_endpoint(endpoint=self._URL_DOC_EXTRACT_RESULTS) + endpoint = endpoint.replace('{jobid}', str(job_id)) + + self._add_header_param(http_request_connector.HTTP_HEADER_CONTENT_TYPE, + http_request_connector.HTTP_CONTENT_TYPE_JSON) + + return self._send(http_method=HttpMethod.GET, + url=endpoint, + response_class=DocumentExportResponse) diff --git a/common/api/request/file_staging_request.py b/common/api/request/file_staging_request.py new file mode 100644 index 0000000..6e58991 --- /dev/null +++ b/common/api/request/file_staging_request.py @@ -0,0 +1,694 @@ +""" +Module that defines classes used to send File Staging requests to the Vault API. 
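+
+A minimal usage sketch (hedged): it assumes an authenticated `vault_client`, as in the
+method examples below.
+
+```python
+request: FileStagingRequest = vault_client.new_request(request_class=FileStagingRequest)
+response: FileStagingItemBulkResponse = request.list_items_at_a_path(item='u1234567', recursive=True)
+for item in response.data:
+    print(item.kind, item.path)
+```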
+""" + +from enum import Enum + +from ..connector import http_request_connector +from ..connector.http_request_connector import HttpMethod +from ..model.response.file_staging_response import FileStagingItemBulkResponse +from ..model.response.file_staging_response import FileStagingItemResponse +from ..model.response.file_staging_response import FileStagingJobResponse +from ..model.response.file_staging_response import FileStagingSessionBulkResponse +from ..model.response.file_staging_response import FileStagingSessionPartBulkResponse +from ..model.response.file_staging_response import FileStagingSessionPartResponse +from ..model.response.file_staging_response import FileStagingSessionResponse +from ..model.response.vault_response import VaultResponse +from ..request.vault_request import VaultRequest, _RequestOption, _ResponseOption + + +class Kind(Enum): + """ + Enumeration class representing different file staging items. + + Attributes: + FILE (str): File + FOLDER (str): Folder + """ + + FILE: str = 'file' + FOLDER: str = 'folder' + + +class FileStagingRequest(VaultRequest): + """ + Class that defines methods used to call File Staging endpoints. + + Vault API Documentation: + [https://developer.veevavault.com/api/24.2/#file-staging](https://developer.veevavault.com/api/24.2/#file-staging) + """ + + _HTTP_HEADER_FILE_PART_NUMBER: str = "X-VaultAPI-FilePartNumber" + + _URL_LIST_ITEMS_AT_A_PATH: str = "/services/file_staging/items/{item}" + _URL_DOWNLOAD_ITEM_CONTENT: str = "/services/file_staging/items/content/{item}" + _URL_FILE_STAGING_CREATE_FILE_OR_FOLDER: str = "/services/file_staging/items" + _URL_FILE_STAGING_UPDATE_OR_DELETE_FILE_OR_FOLDER: str = "/services/file_staging/items/{item}"; + _URL_FILE_STAGING_CREATE_RESUMABLE_UPLOAD_SESSION: str = "/services/file_staging/upload" + _URL_FILE_STAGING_RESUMABLE_SESSION: str = "/services/file_staging/upload/{upload_session_id}" + _URL_FILE_STAGING_RESUMABLE_SESSION_PARTS: str = "/services/file_staging/upload/{upload_session_id}/parts" + + _RECURSIVE_PARAMETER: str = "recursive" + _LIMIT_PARAMETER: str = "limit" + _FORMAT_RESULT_PARAMETER: str = "format_result" + _KIND_PARAMETER: str = "kind" + _PATH_PARAMETER: str = "path" + _OVERWRITE_PARAMETER: str = "overwrite" + _FILE_PARAMETER: str = "file" + _PARENT_PARAMETER: str = "parent" + _NAME_PARAMETER: str = "name" + _SIZE_PARAMETER: str = "size" + _CHUNK_PARAMETER: str = "@/chunk-ab." + + def list_items_at_a_path(self, item: str = '', + recursive: bool = None, + limit: int = None, + format_result: str = None) -> FileStagingItemBulkResponse: + """ + **List Items at a Path** + + Return a list of files and folders for the specified path. + Paths are different for Admin users (Vault Owners and System Admins) and non-Admin users. + + Args: + item (str): Path to the file or folder + recursive (bool): If true, the response will contain the contents of all subfolders. If not specified, the default value is false + limit (int): The maximum number of items per page in the response. This can be any value between 1 and 1000. If omitted, the default value is 1000 + format_result (str): If set to csv, the response includes a job_id. 
Use the Job ID value to retrieve the status and results of the request + + Returns: + FileStagingItemBulkResponse: Modeled response from Vault + + Vault API Endpoint: + GET /api/{version}/services/file_staging/items/{item} + + Vault API Documentation: + [https://developer.veevavault.com/api/24.2/#list-items-at-a-path](https://developer.veevavault.com/api/24.2/#list-items-at-a-path) + + Example: + ```python + # Example Request + user_folder: str = 'u1234567' + request: FileStagingRequest = vault_client.new_request(request_class=FileStagingRequest) + response: FileStagingItemBulkResponse = request.list_items_at_a_path(recursive=True, item=user_folder) + + # Example Response + for item in response.data: + print('-----Item-----') + print(f'Name: {item.name}') + print(f'Kind: {item.kind}') + print(f'Path: {item.path}') + ``` + """ + + endpoint = self.get_api_endpoint(endpoint=self._URL_LIST_ITEMS_AT_A_PATH) + endpoint = endpoint.replace('{item}', item) + + if recursive is not None: + self._add_query_param(self._RECURSIVE_PARAMETER, recursive) + + if limit is not None: + self._add_query_param(self._LIMIT_PARAMETER, limit) + + if format_result is not None: + self._add_query_param(self._FORMAT_RESULT_PARAMETER, format_result) + + return self._send(http_method=HttpMethod.GET, + url=endpoint, + response_class=FileStagingItemBulkResponse) + + def list_items_at_a_path_by_page(self, page_url: str) -> FileStagingItemBulkResponse: + """ + **List Items at a Path by Page** + + Return a list of files and folders for the specified page url. + + Args: + page_url (str): full path to the page (including https://{vaultDNS}/api/{version}/) + + Returns: + FileStagingItemBulkResponse: Modeled response from Vault + + Vault API Endpoint: + GET /api/{version}/services/file_staging/items/{item} + + Vault API Documentation: + [https://developer.veevavault.com/api/24.2/#list-items-at-a-path](https://developer.veevavault.com/api/24.2/#list-items-at-a-path) + + Example: + ```python + # Example Request + next_page_url: str = list_response.responseDetails.next_page + request: FileStagingRequest = vault_client.new_request(request_class=FileStagingRequest) + response: FileStagingItemBulkResponse = request.list_items_at_a_path_by_page(next_page_url) + + # Example Response + for item in response.data: + print('-----Item-----') + print(f'Name: {item.name}') + print(f'Kind: {item.kind}') + print(f'Path: {item.path}') + ``` + """ + + endpoint = self.get_pagination_endpoint(page_url) + + return self._send(http_method=HttpMethod.GET, + url=endpoint, + response_class=FileStagingItemBulkResponse) + + def download_item_content(self, item: str = '', + byte_range: str = None) -> VaultResponse: + """ + **Download Item Content** + + Retrieve the content of a specified file from the file staging server. Use the Range header to create resumable downloads for large files, + or to continue downloading a file if your session is interrupted. 
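+
+        For example, a hedged sketch of a partial download (the file path is hypothetical and
+        `byte_range` uses standard HTTP Range syntax):
+
+        ```python
+        request: FileStagingRequest = vault_client.new_request(request_class=FileStagingRequest)
+        # Fetch only the first 1 MB of the file
+        response: VaultResponse = request.download_item_content(item='u1234567/large_file.zip',
+                                                                byte_range='bytes=0-1048575')
+        ```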
+ + Args: + item (str): Path to the file + byte_range (str): Specifies a partial range of bytes to include in the download + + Returns: + VaultResponse: Modeled response from Vault + + Vault API Endpoint: + GET /api/{version}/services/file_staging/items/content/{item} + + Vault API Documentation: + [https://developer.veevavault.com/api/24.2/#download-item-content](https://developer.veevavault.com/api/24.2/#download-item-content) + + Example: + ```python + # Example Request + file_path: str = "u1234567/test_document.docx" + request: FileStagingRequest = vault_client.new_request(request_class=FileStagingRequest) + response = request.download_item_content(item=file_path) + + # Example Response + print(f'Size: {len(response.binary_content)}') + ``` + """ + endpoint = self.get_api_endpoint(endpoint=self._URL_DOWNLOAD_ITEM_CONTENT) + endpoint = endpoint.replace('{item}', item) + + if byte_range is not None: + self._add_header_param(http_request_connector.HTTP_HEADER_RANGE, byte_range) + + self._response_option = _ResponseOption.BYTES + + return self._send(http_method=HttpMethod.GET, + url=endpoint, + response_class=VaultResponse) + + def create_folder_or_file(self, kind: Kind, + path: str, + overwrite: bool = None, + input_path: str = None, + content_md5: str = None) -> FileStagingItemResponse: + """ + **Create Folder or File** + + Upload files or folders up to 50MB to the File Staging Server. + + Args: + kind (Kind): a Kind enum value representing the type of the item. Can be either FILE or FOLDER type + path (str): The absolute path, including file or folder name, to place the item in the file staging server + overwrite (bool): If set to true, Vault will overwrite any existing files with the same name at the specified destination. For folders, this is always false + input_path (str): Path to the file or folder to upload + content_md5 (str): The MD5 checksum of the file being uploaded + + Returns: + FileStagingItemResponse: Modeled response from Vault + + Vault API Endpoint: + POST /api/{version}/services/file_staging/items + + Vault API Documentation: + [https://developer.veevavault.com/api/24.2/#create-folder-or-file](https://developer.veevavault.com/api/24.2/#create-folder-or-file) + + Example: + ```python + # Example Request + test_folder: str = 'u1234567/test_create_folder' + request: FileStagingRequest = vault_client.new_request(request_class=FileStagingRequest) + response: FileStagingItemResponse = request.create_folder_or_file(kind=Kind.FOLDER, + path=test_folder) + + # Example Response + item: FileStagingItem = response.data + print(f'Name: {item.name}') + print(f'Kind: {item.kind}') + print(f'Path: {item.path}') + ``` + """ + + endpoint = self.get_api_endpoint(endpoint=self._URL_FILE_STAGING_CREATE_FILE_OR_FOLDER) + + self._add_body_param(self._KIND_PARAMETER, kind.value) + self._add_body_param(self._PATH_PARAMETER, path) + + if overwrite is not None: + self._add_body_param(self._OVERWRITE_PARAMETER, overwrite) + + if content_md5 is not None: + self._add_header_param(http_request_connector.HTTP_HEADER_CONTENT_MD5, content_md5) + + if input_path is not None: + self._add_file_multipart(self._FILE_PARAMETER, input_path) + + return self._send(http_method=HttpMethod.POST, + url=endpoint, + response_class=FileStagingItemResponse) + + def update_folder_or_file(self, item: str, + parent: str = None, + name: str = None) -> FileStagingJobResponse: + """ + **Update Folder or File** + + Move or rename a folder or file on the file staging server. 
You can move and rename an item in the same request. + + Args: + item (str): The absolute path to a file or folder. This path is specific to the authenticated user. Admin users can access the root directory. All other users can only access their own user directory + parent (str): When moving a file or folder, specifies the absolute path to the parent directory in which to place the file + name (str): When renaming a file or folder, specifies the new name + + Returns: + FileStagingJobResponse: Modeled response from Vault + + Vault API Endpoint: + PUT /api/{version}/services/file_staging/items/{item} + + Vault API Documentation: + [https://developer.veevavault.com/api/24.2/#update-folder-or-file](https://developer.veevavault.com/api/24.2/#update-folder-or-file) + + Example: + ```python + # Example Request + current_folder_path: str = 'u1234567/test_create_folder' + new_folder_name: str = 'test_update_folder' + request: FileStagingRequest = vault_client.new_request(request_class=FileStagingRequest) + response: FileStagingJobResponse = request.update_folder_or_file(item=current_folder_path, + name=new_folder_name) + + # Example Response + print(f'Job ID: {response.data.job_id}') + print(f'URL: {response.data.url}') + ``` + """ + + endpoint = self.get_api_endpoint(endpoint=self._URL_FILE_STAGING_UPDATE_OR_DELETE_FILE_OR_FOLDER) + endpoint = endpoint.replace('{item}', item) + + if parent is not None: + self._add_body_param(self._PARENT_PARAMETER, parent) + + if name is not None: + self._add_body_param(self._NAME_PARAMETER, name) + + return self._send(http_method=HttpMethod.PUT, + url=endpoint, + response_class=FileStagingJobResponse) + + def delete_folder_or_file(self, item: str, + recursive: bool = None) -> FileStagingJobResponse: + """ + **Delete Folder or File** + + Delete an individual file or folder from the file staging server. + + Args: + item (str): The absolute path to a file or folder. This path is specific to the authenticated user. Admin users can access the root directory. All other users can only access their own user directory + recursive (bool): Applicable to deleting folders only. If true, the request will delete the contents of a folder and all subfolders. The default is false + + Returns: + FileStagingJobResponse: Modeled response from Vault + + Vault API Endpoint: + DELETE /api/{version}/services/file_staging/items/{item} + + Vault API Documentation: + [https://developer.veevavault.com/api/24.2/#delete-file-or-folder](https://developer.veevavault.com/api/24.2/#delete-file-or-folder) + + Example: + ```python + # Example Request + folder_path: str = 'u1234567/test_create_folder' + request: FileStagingRequest = vault_client.new_request(request_class=FileStagingRequest) + response: FileStagingJobResponse = request.delete_folder_or_file(folder_path) + + # Example Response + print(f'Job ID: {response.data.job_id}') + print(f'URL: {response.data.url}') + ``` + """ + + endpoint = self.get_api_endpoint(endpoint=self._URL_FILE_STAGING_UPDATE_OR_DELETE_FILE_OR_FOLDER) + endpoint = endpoint.replace('{item}', item) + + if recursive is not None: + self._add_query_param(self._RECURSIVE_PARAMETER, recursive) + + return self._send(http_method=HttpMethod.DELETE, + url=endpoint, + response_class=FileStagingJobResponse) + + def create_resumable_upload_session(self, path: str, + size: int, + overwrite: bool = None) -> FileStagingSessionResponse: + """ + **Create Resumable Upload Session** + + Initiate a multipart upload session and return an upload session ID. 
+ + Args: + path (str): The absolute path, including file name, to place the file in the file staging server + size (int): The size of the file in bytes + overwrite (bool): If set to true, Vault will overwrite any existing files with the same name at the specified destination + + Returns: + FileStagingSessionResponse: Modeled response from Vault + + Vault API Endpoint: + POST /api/{version}/services/file_staging/upload + + Vault API Documentation: + [https://developer.veevavault.com/api/24.2/#create-resumable-upload-session](https://developer.veevavault.com/api/24.2/#create-resumable-upload-session) + + Example: + ```python + # Example Request + local_file_path: str = 'path/to/file.txt' + file_size: int = os.path.getsize(local_file_path) + file_staging_path: str = 'u1234567/vapil_test_document.docx' + request: FileStagingRequest = vault_client.new_request(request_class=FileStagingRequest) + response: FileStagingSessionResponse = request.create_resumable_upload_session(path=file_staging_path, + size=file_size, + overwrite=True) + + # Example Response + print(f'ID: {response.data.id}') + print(f'Path: {response.data.path}') + ``` + """ + + endpoint = self.get_api_endpoint(endpoint=self._URL_FILE_STAGING_CREATE_RESUMABLE_UPLOAD_SESSION) + + self._add_body_param(self._PATH_PARAMETER, path) + self._add_body_param(self._SIZE_PARAMETER, size) + + if overwrite is not None: + self._add_body_param(self._OVERWRITE_PARAMETER, overwrite) + + return self._send(http_method=HttpMethod.POST, + url=endpoint, + response_class=FileStagingSessionResponse) + + def upload_to_a_session(self, upload_session_id: str, + part_number: str, + content_md5: str = None, + file_path: str = None) -> FileStagingSessionPartResponse: + """ + **Upload to a Session** + + The session owner can upload parts of a file to an active upload session. + By default, you can upload up to 2000 parts per upload session, and each part can be up to 50MB. + Use the Range header to specify the range of bytes for each upload, or split files into parts and add each part as a separate file. + Each part must be the same size, except for the last part in the upload session. 
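+
+        A hedged end-to-end sketch of the part-upload loop: the 50 MB chunk size, the temporary
+        part file, and the `local_file_path`/`upload_session_id` variables are illustrative
+        assumptions, not part of this API.
+
+        ```python
+        CHUNK_SIZE = 50 * 1024 * 1024  # every part except the last must be the same size
+
+        part_number = 1
+        with open(local_file_path, 'rb') as source:
+            while True:
+                chunk = source.read(CHUNK_SIZE)
+                if not chunk:
+                    break
+                # Write the chunk to a temporary file and upload it as the next part
+                with open('part.tmp', 'wb') as part_file:
+                    part_file.write(chunk)
+                request: FileStagingRequest = vault_client.new_request(request_class=FileStagingRequest)
+                request.upload_to_a_session(upload_session_id=upload_session_id,
+                                            part_number=str(part_number),
+                                            file_path='part.tmp')
+                part_number += 1
+
+        # Assemble the uploaded parts into the final file
+        request: FileStagingRequest = vault_client.new_request(request_class=FileStagingRequest)
+        request.commit_upload_session(upload_session_id=upload_session_id)
+        ```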
+ + Args: + upload_session_id (str): The upload session ID + part_number (str): The part number of the file being uploaded + content_md5 (str): The MD5 checksum of the file being uploaded + file_path (str): Path to the file to upload + + Returns: + FileStagingSessionPartResponse: Modeled response from Vault + + Vault API Endpoint: + PUT /api/{version}/services/file_staging/upload/{upload_session_id} + + Vault API Documentation: + [https://developer.veevavault.com/api/24.2/#upload-to-a-session](https://developer.veevavault.com/api/24.2/#upload-to-a-session) + + Example: + ```python + # Example Request + request: FileStagingRequest = vault_client.new_request(request_class=FileStagingRequest) + response: FileStagingSessionPartResponse = request.upload_to_a_session(upload_session_id=upload_session_id, + part_number='1', + file_path=file_path) + + # Example Response + print(f'Size: {response.data.size}') + print(f'MD5: {response.data.part_content_md5}') + ``` + """ + + endpoint = self.get_api_endpoint(endpoint=self._URL_FILE_STAGING_RESUMABLE_SESSION) + endpoint = endpoint.replace('{upload_session_id}', upload_session_id) + + self._add_header_param(http_request_connector.HTTP_HEADER_CONTENT_TYPE, + http_request_connector.HTTP_CONTENT_TYPE_OCTET) + self._add_header_param(self._HTTP_HEADER_FILE_PART_NUMBER, part_number) + + if content_md5 is not None: + self._add_header_param(http_request_connector.HTTP_HEADER_CONTENT_MD5, content_md5) + + if file_path is not None: + binary_content = None + with open(file_path, 'rb') as file: + binary_content = file.read() + self._binary_content = binary_content + self._request_option = _RequestOption.BYTES + self._add_header_param(http_request_connector.HTTP_HEADER_CONTENT_LENGTH, str(len(binary_content))) + + return self._send(http_method=HttpMethod.PUT, + url=endpoint, + response_class=FileStagingSessionPartResponse) + + def commit_upload_session(self, upload_session_id: str) -> FileStagingJobResponse: + """ + **Commit Upload Session** + + Mark an upload session as complete and assemble all previously uploaded parts to create a file. + + Args: + upload_session_id (str): The upload session ID + + Returns: + FileStagingJobResponse: Modeled response from Vault + + Vault API Endpoint: + POST /api/{version}/services/file_staging/upload/{upload_session_id} + + Vault API Documentation: + [https://developer.veevavault.com/api/24.2/#commit-upload-session](https://developer.veevavault.com/api/24.2/#commit-upload-session) + + Example: + ```python + # Example Request + request: FileStagingRequest = vault_client.new_request(request_class=FileStagingRequest) + response: FileStagingJobResponse = request.commit_upload_session(upload_session_id=upload_session_id) + + # Example Response + print(f'Job ID: {response.data.job_id}') + ``` + """ + + endpoint = self.get_api_endpoint(endpoint=self._URL_FILE_STAGING_RESUMABLE_SESSION) + endpoint = endpoint.replace('{upload_session_id}', upload_session_id) + + return self._send(http_method=HttpMethod.POST, + url=endpoint, + response_class=FileStagingJobResponse) + + def abort_upload_session(self, upload_session_id: str) -> VaultResponse: + """ + **Abort Upload Session** + + Abort an active upload session and purge all uploaded file parts. Admin users can see and abort all upload sessions, + while non-Admin users can only see and abort sessions where they are the owner. 
+
+        Args:
+            upload_session_id (str): The upload session ID
+
+        Returns:
+            VaultResponse: Modeled response from Vault
+
+        Vault API Endpoint:
+            DELETE /api/{version}/services/file_staging/upload/{upload_session_id}
+
+        Vault API Documentation:
+            [https://developer.veevavault.com/api/24.2/#abort-upload-session](https://developer.veevavault.com/api/24.2/#abort-upload-session)
+
+        Example:
+            ```python
+            # Example Request
+            request: FileStagingRequest = vault_client.new_request(request_class=FileStagingRequest)
+            response: VaultResponse = request.abort_upload_session(upload_session_id=upload_session_id)
+
+            # Example Response
+            print(f'Response Status: {response.responseStatus}')
+            ```
+        """
+
+        endpoint = self.get_api_endpoint(endpoint=self._URL_FILE_STAGING_RESUMABLE_SESSION)
+        endpoint = endpoint.replace('{upload_session_id}', upload_session_id)
+
+        return self._send(http_method=HttpMethod.DELETE,
+                          url=endpoint,
+                          response_class=VaultResponse)
+
+    def list_upload_sessions(self) -> FileStagingSessionBulkResponse:
+        """
+        **List Upload Sessions**
+
+        Return a list of active upload sessions.
+
+        Returns:
+            FileStagingSessionBulkResponse: Modeled response from Vault
+
+        Vault API Endpoint:
+            GET /api/{version}/services/file_staging/upload
+
+        Vault API Documentation:
+            [https://developer.veevavault.com/api/24.2/#list-upload-sessions](https://developer.veevavault.com/api/24.2/#list-upload-sessions)
+
+        Example:
+            ```python
+            # Example Request
+            request: FileStagingRequest = vault_client.new_request(request_class=FileStagingRequest)
+            response: FileStagingSessionBulkResponse = request.list_upload_sessions()
+
+            # Example Response
+            for session in response.data:
+                print('-----Session-----')
+                print(f'ID: {session.id}')
+                print(f'Path: {session.path}')
+            ```
+        """
+
+        endpoint = self.get_api_endpoint(endpoint=self._URL_FILE_STAGING_CREATE_RESUMABLE_UPLOAD_SESSION)
+
+        return self._send(http_method=HttpMethod.GET,
+                          url=endpoint,
+                          response_class=FileStagingSessionBulkResponse)
+
+    def list_upload_sessions_by_page(self, page_url: str) -> FileStagingSessionBulkResponse:
+        """
+        **List Upload Sessions by Page**
+
+        Return a list of active upload sessions using the previous_page or next_page parameter of a previous request.
+
+        Args:
+            page_url (str): full path to the page (including https://{vaultDNS}/api/{version}/)
+
+        Returns:
+            FileStagingSessionBulkResponse: Modeled response from Vault
+
+        Vault API Endpoint:
+            GET /api/{version}/services/file_staging/upload
+
+        Vault API Documentation:
+            [https://developer.veevavault.com/api/24.2/#list-upload-sessions](https://developer.veevavault.com/api/24.2/#list-upload-sessions)
+
+        Example:
+            ```python
+            # Example Request
+            # next_page_url is the next_page value returned by a previous list_upload_sessions() call
+            request: FileStagingRequest = vault_client.new_request(request_class=FileStagingRequest)
+            response: FileStagingSessionBulkResponse = request.list_upload_sessions_by_page(page_url=next_page_url)
+
+            # Example Response
+            for session in response.data:
+                print('-----Session-----')
+                print(f'ID: {session.id}')
+                print(f'Path: {session.path}')
+            ```
+        """
+
+        endpoint = self.get_pagination_endpoint(page_url)
+
+        return self._send(http_method=HttpMethod.GET,
+                          url=endpoint,
+                          response_class=FileStagingSessionBulkResponse)
+
+    def get_upload_session_details(self, upload_session_id: str) -> FileStagingSessionResponse:
+        """
+        **Get Upload Session Details**
+
+        Retrieve the details of an active upload session. Admin users can get details for all sessions,
+        while non-Admin users can only get details for sessions if they are the owner.
+ + Args: + upload_session_id (str): The upload session ID + + Returns: + FileStagingSessionResponse: Modeled response from Vault + + Vault API Endpoint: + GET /api/{version}/services/file_staging/upload/{upload_session_id} + + Vault API Documentation: + [https://developer.veevavault.com/api/24.2/#get-upload-session-details](https://developer.veevavault.com/api/24.2/#get-upload-session-details) + + Example: + ```python + # Example Request + request: FileStagingRequest = vault_client.new_request(request_class=FileStagingRequest) + response: FileStagingSessionResponse = request.get_upload_session_details(upload_session_id=upload_session_id) + + # Example Response + print(f'ID: {response.data.id}') + print(f'Path: {response.data.path}') + print(f'Uploaded Parts: {response.data.uploaded_parts}') + ``` + """ + + endpoint = self.get_api_endpoint(endpoint=self._URL_FILE_STAGING_RESUMABLE_SESSION) + endpoint = endpoint.replace('{upload_session_id}', upload_session_id) + + return self._send(http_method=HttpMethod.GET, + url=endpoint, + response_class=FileStagingSessionResponse) + + def list_file_parts_uploaded_to_a_session(self, upload_session_id: str, + limit: int = None) -> FileStagingSessionPartBulkResponse: + """ + **List File Parts Uploaded to a Session** + + Return a list of parts uploaded in a session. You must be an Admin user or the session owner. + + Args: + upload_session_id (str): The upload session ID + limit (int): The maximum number of items per page in the response. This can be any value between 1 and 1000. If omitted, the default value is 1000 + + Returns: + FileStagingSessionPartBulkResponse: Modeled response from Vault + + Vault API Endpoint: + GET /api/{version}/services/file_staging/upload/{upload_session_id}/parts + + Vault API Documentation: + [https://developer.veevavault.com/api/24.2/#list-file-parts-uploaded-to-session](https://developer.veevavault.com/api/24.2/#list-file-parts-uploaded-to-session) + + Example: + ```python + # Example Request + request: FileStagingRequest = vault_client.new_request(request_class=FileStagingRequest) + response: FileStagingSessionPartBulkResponse = request.list_file_parts_uploaded_to_a_session(upload_session_id) + + # Example Response + for part in response.data: + print('-----Part-----') + print(f'Part Number: {part.part_number}') + print(f'Size: {part.size}') + print(f'MD5: {part.part_content_md5}') + ``` + """ + + endpoint = self.get_api_endpoint(endpoint=self._URL_FILE_STAGING_RESUMABLE_SESSION_PARTS) + endpoint = endpoint.replace('{upload_session_id}', upload_session_id) + + if limit is not None: + self._add_query_param(self._LIMIT_PARAMETER, limit) + + return self._send(http_method=HttpMethod.GET, + url=endpoint, + response_class=FileStagingSessionPartBulkResponse) diff --git a/common/api/request/vault_request.py b/common/api/request/vault_request.py index 3cbeed1..0653dfc 100644 --- a/common/api/request/vault_request.py +++ b/common/api/request/vault_request.py @@ -35,6 +35,19 @@ class _RequestOption(Enum): EMPTY = 'EMPTY' BYTES = 'BYTES' + STRING = 'STRING' + + +class _ResponseOption(Enum): + # Enumeration class representing different options for handling request data in a Vault request. 
+ # + # Attributes: + # EMPTY (RequestOption): Represents an empty request option + # BYTES (RequestOption): Represents a request option for handling raw bytes + + STRING = 'STRING' + TO_FILE = 'TO_FILE' + BYTES = 'BYTES' @dataclass @@ -52,33 +65,34 @@ class VaultRequest(ABC): the Reference ID is returned in the response headers of the returned Response class. """ - VAULT_API_VERSION: str = 'v24.1' + VAULT_API_VERSION: str = 'v24.2' HTTP_HEADER_AUTHORIZATION: str = 'Authorization' HTTP_HEADER_VAULT_CLIENT_ID: str = 'X-VaultAPI-ClientID' HTTP_HEADER_REFERENCE_ID: str = "X-VaultAPI-ReferenceId" reference_id: str = None - _header_params: Dict[str, Any] = Field(default_factory=dict) - _body_params: Dict[Any, Any] = Field(default_factory=dict) - _query_params: Dict[str, Any] = Field(default_factory=dict) - _file_params: Dict[str, Any] = Field(default_factory=dict) - _binary_content: bytes = None - _vault_dns: str = None - _vault_username: str = None - _vault_password: str = None - _vault_client_id: str = None - _http_timeout: int = 60 - _set_log_api_errors: bool = True - _idp_oauth_access_token: str = None - _idp_oauth_scope: str = 'openid' - _idp_password: str = None - _idp_username: str = None - _idp_client_id: str = None - _set_validate_session: bool = True - _vault_oauth_client_id: str = None - _vault_oauth_profile_id: str = None - _vault_session_id: str = None - _request_option: _RequestOption = _RequestOption.EMPTY + _header_params: Dict[str, Any] = Field(default_factory=dict, alias="header_params") + _body_params: Dict[Any, Any] = Field(default_factory=dict, alias="body_params") + _query_params: Dict[str, Any] = Field(default_factory=dict, alias="query_params") + _file_params: Dict[str, Any] = Field(default_factory=dict, alias="file_params") + _request_raw_string: str = Field(default=None, alias="request_raw_string") + _binary_content: bytes = Field(default=None, alias="binary_content") + _vault_dns: str = Field(default=None, alias="vault_dns") + _vault_username: str = Field(default=None, alias="vault_username") + _vault_password: str = Field(default=None, alias="vault_password") + _vault_client_id: str = Field(default=None, alias="vault_client_id") + _http_timeout: int = Field(default=60, alias="http_timeout") + _set_log_api_errors: bool = Field(default=True, alias="set_log_api_errors") + _idp_oauth_access_token: str = Field(default=None, alias="idp_oauth_access_token") + _idp_oauth_scope: str = Field(default='openid', alias="idp_oauth_scope") + _idp_password: str = Field(default=None, alias="idp_password") + _idp_username: str = Field(default=None, alias="idp_username") + _idp_client_id: str = Field(default=None, alias="idp_client_id") + _set_validate_session: bool = Field(default=True, alias="set_validate_session") + _vault_oauth_client_id: str = Field(default=None, alias="vault_oauth_client_id") + _vault_oauth_profile_id: str = Field(default=None, alias="vault_oauth_profile_id") + _vault_session_id: str = Field(default=None, alias="vault_session_id") + _request_option: _RequestOption = Field(default=_RequestOption.EMPTY, alias="request_option") def _send(self, http_method: http_request_connector.HttpMethod, url: str, @@ -163,6 +177,15 @@ def _add_query_param(self, key: str, value: Any): # value (Any): The value of the parameter self._add_param(param_type=_ParamType.QUERY, key=key, value=value) + def _add_raw_string(self, raw_string: str): + # Add a string to the request, such as POST of raw data + # + # Args: + # raw_string (str): The raw string to be included in the request + + 
self._request_option = _RequestOption.STRING + self._request_raw_string = raw_string + def _add_file_multipart(self, param_name: str, file_path: str): # Add a file parameter for multipart/form-data in the request. # @@ -250,5 +273,7 @@ def _get_request_body(self) -> Any: return self._body_params elif self._request_option == _RequestOption.BYTES: return self._binary_content + elif self._request_option == _RequestOption.STRING: + return self._request_raw_string else: return None diff --git a/common/aws_utilities.py b/common/aws_utilities.py index baf8044..bfcb0c7 100644 --- a/common/aws_utilities.py +++ b/common/aws_utilities.py @@ -1,3 +1,5 @@ +import json + import psycopg2 import boto3 from .log_message import log_message @@ -72,8 +74,8 @@ def generate_enviroment_variables(job_parameters): environment_variables.append({'name': 'STEP', 'value': job_parameters['step']}) if 'source_filepath' in job_parameters and job_parameters['source_filepath'] is not None: environment_variables.append({'name': 'SOURCE_FILEPATH', 'value': job_parameters['source_filepath']}) - if 'target_filepath' in job_parameters and job_parameters['target_filepath'] is not None: - environment_variables.append({'name': 'TARGET_FILEPATH', 'value': job_parameters['target_filepath']}) + if 'target_directory' in job_parameters and job_parameters['target_directory'] is not None: + environment_variables.append({'name': 'TARGET_DIRECTORY', 'value': job_parameters['target_directory']}) if 'continue_processing' in job_parameters and job_parameters['continue_processing'] is not None: environment_variables.append({'name': 'CONTINUE_PROCESSING', 'value': job_parameters['continue_processing']}) if 'start_time' in job_parameters and job_parameters['start_time'] is not None: @@ -82,6 +84,15 @@ def generate_enviroment_variables(job_parameters): environment_variables.append({'name': 'STOP_TIME', 'value': job_parameters['stop_time']}) if 'extract_type' in job_parameters and job_parameters['extract_type'] is not None: environment_variables.append({'name': 'EXTRACT_TYPE', 'value': job_parameters['extract_type']}) + if 'doc_version_ids' in job_parameters and job_parameters['doc_version_ids'] is not None: + environment_variables.append({'name': 'DOC_VERSION_IDS', 'value': job_parameters['doc_version_ids']}) + if 'extract_source_content' in job_parameters and job_parameters['extract_source_content'] is not None: + environment_variables.append( + {'name': 'EXTRACT_SOURCE_CONTENT', 'value': job_parameters['extract_source_content']}) + if 'secret_name' in job_parameters and job_parameters['secret_name'] is not None: + environment_variables.append({'name': 'SECRET_NAME', 'value': job_parameters['secret_name']}) + if 'secret' in job_parameters and job_parameters['secret'] is not None: + environment_variables.append({'name': 'SECRET', 'value': job_parameters['secret']}) return environment_variables @@ -102,14 +113,22 @@ def generate_command_overrides(job_parameters): container_command.extend(["--step", job_parameters['step']]) if 'source_filepath' in job_parameters and job_parameters['source_filepath'] is not None: container_command.extend(["--source_filepath", job_parameters['source_filepath']]) - if 'target_filepath' in job_parameters and job_parameters['target_filepath'] is not None: - container_command.extend(["--target_filepath", job_parameters['target_filepath']]) + if 'target_directory' in job_parameters and job_parameters['target_directory'] is not None: + container_command.extend(["--target_directory", job_parameters['target_directory']]) if 
'continue_processing' in job_parameters and job_parameters['continue_processing'] is not None: container_command.extend(["--continue_processing", job_parameters['continue_processing']]) if 'start_time' in job_parameters and job_parameters['start_time'] is not None: container_command.extend(["--start_time", job_parameters['start_time']]) if 'stop_time' in job_parameters and job_parameters['stop_time'] is not None: container_command.extend(["--stop_time", job_parameters['stop_time']]) + if 'extract_type' in job_parameters and job_parameters['extract_type'] is not None: + container_command.extend(["--extract_type", job_parameters['extract_type']]) + if 'doc_version_ids' in job_parameters and job_parameters['doc_version_ids'] is not None: + container_command.extend(["--doc_version_ids", job_parameters['doc_version_ids']]) + if 'secret_name' in job_parameters and job_parameters['secret_name'] is not None: + container_command.extend(["--secret_name", job_parameters['secret_name']]) + if 'secret' in job_parameters and job_parameters['secret'] is not None: + container_command.extend(["--secret", job_parameters['secret']]) return container_command @@ -129,6 +148,153 @@ def get_batch_region(): raise Exception("No compute environments found.") +def upload_large_file(s3, bucket_name, key, file_content): + try: + # Initiate multipart upload + multipart_upload = s3.create_multipart_upload(Bucket=bucket_name, Key=key) + upload_id = multipart_upload['UploadId'] + + # Upload parts + parts = [] + part_size = 5 * 1024 * 1024 # 5 MB + for i in range(0, len(file_content), part_size): + part_num = len(parts) + 1 + part_data = file_content[i:i + part_size] + part = s3.upload_part( + Bucket=bucket_name, + Key=key, + PartNumber=part_num, + UploadId=upload_id, + Body=part_data + ) + parts.append({'PartNumber': part_num, 'ETag': part['ETag']}) + + # Complete multipart upload + s3.complete_multipart_upload( + Bucket=bucket_name, + Key=key, + UploadId=upload_id, + MultipartUpload={'Parts': parts} + ) + except ClientError as e: + s3.abort_multipart_upload(Bucket=bucket_name, Key=key, UploadId=upload_id) + log_message(log_level='Error', + message=f'Multipart upload failed', + exception=e, + context=None) + raise e + + +def get_s3_path(filename, s3_bucket, subfolder): + """ + + :param filename: The name of the file to locate + :param s3_bucket: The name of the S3 bucket + :param subfolder: The directory the file is located + :param file_type: full, updates or deletes. Depending on these choices, the file key is searched. 
+ :return: + """ + try: + s3 = boto3.resource('s3') + bucket = s3.Bucket(s3_bucket) + prefix = f"{subfolder}/" + for obj in bucket.objects.filter(Prefix=prefix): + if filename in obj.key: + return f"s3://{s3_bucket}/{obj.key}" + log_message(log_level='Info', + message=f'For {filename}, s3 file not found', + exception=None, context=None) + except Exception as e: + raise e + return None + + +def retrieve_doc_version_ids_from_s3(file_path: str) -> list[str]: + s3 = boto3.client('s3') + + # Check if file_path is a filepath in S3 + if file_path.startswith("s3://"): + # It's an S3 path, so parse the bucket and key + s3_path_parts = file_path[5:].split("/", 1) + bucket_name = s3_path_parts[0] + key = s3_path_parts[1] + + try: + # Download the file from S3 + response = s3.get_object(Bucket=bucket_name, Key=key) + file_content = response['Body'].read().decode('utf-8') + + # Attempt to parse the file content as JSON + doc_version_ids_list = json.loads(file_content) + log_message(log_level='Debug', + message=f'Amount of documents to be extracted in total: {len(doc_version_ids_list)}', + context=None) + if isinstance(doc_version_ids_list, list): + # Get the first 10,000 document version IDs + batch_size = 10000 + first_batch = doc_version_ids_list[:batch_size] + log_message(log_level='Debug', + message=f'Amount of batched documents to be extracted {len(first_batch)}', + context=None) + remaining_batch = doc_version_ids_list[batch_size:] + log_message(log_level='Debug', + message=f'Amount of remaining documents to be extracted {len(remaining_batch)}', + context=None) + + if remaining_batch: + log_message(log_level='Debug', + message=f'Updating the file: {key}', + context=None) + # If there are remaining IDs, update the file on S3 + remaining_content = json.dumps(remaining_batch) + s3.put_object(Bucket=bucket_name, Key=key, Body=remaining_content) + else: + log_message(log_level='Debug', + message=f'Deleting the file: {key}', + context=None) + # If no IDs are left, delete the file from S3 + s3.delete_object(Bucket=bucket_name, Key=key) + log_message(log_level='Info', + message=f'File {key} deleted successfully', + context=None) + + return first_batch + else: + raise ValueError("The content is not a valid list of document version IDs") + except (s3.exceptions.NoSuchKey, json.JSONDecodeError, ValueError) as e: + raise ValueError(f"Error processing S3 file {file_path}: {e}") + else: + try: + # If it's not an S3 path, assume it's a JSON string + doc_version_ids_list = json.loads(file_path) + if isinstance(doc_version_ids_list, list): + return doc_version_ids_list + else: + raise ValueError("The string is not a valid list of document version IDs") + except json.JSONDecodeError: + raise ValueError(f"The parameter is neither a valid S3 path nor a valid JSON array: {file_path}") + + +def check_file_exists_s3(bucket_name: str, file_key: str) -> bool: + """ + Check if a file exists in an S3 bucket. + + :param bucket_name: The name of the S3 bucket. + :param file_key: The key (path) to the file in the S3 bucket. + :return: True if the file exists, False otherwise. + """ + s3 = boto3.client('s3') + + try: + s3.head_object(Bucket=bucket_name, Key=file_key) + return True + except ClientError as e: + if e.response['Error']['Code'] == '404': + return False + else: + raise e + + class RedshiftConnection: """ This class is a connector to Redshift. 
diff --git a/common/integrationConfigClass.py b/common/integrationConfigClass.py index aee39d1..4ed88f4 100644 --- a/common/integrationConfigClass.py +++ b/common/integrationConfigClass.py @@ -9,9 +9,10 @@ class IntegrationConfigClass: """ Used to get all the required args from AWS Secrets Manager """ - def __init__(self, region: str): + def __init__(self, region: str, secret_name: str): self._config = None self.region = region + self.secret_name = secret_name @property def config(self): """ @@ -30,10 +31,9 @@ def get_secret(self) -> str: :return: The parsed secrets """ - # Insert the Secret Name of your config file from the AWS secrets manager - secret_name = "direct-data-config.ini" # Create a Secrets Manager client session = boto3.session.Session() + secret_name = self.secret_name client = session.client( service_name='secretsmanager', region_name=self.region diff --git a/common/integrationRequestClass.py b/common/integrationRequestClass.py index eb99952..3ca738a 100644 --- a/common/integrationRequestClass.py +++ b/common/integrationRequestClass.py @@ -24,8 +24,10 @@ def __init__(self, event): self.starting_directory = None self.direct_data_listing_response = None self.target_directory = None - self.source_file = None + self.source_filepath = None self.continue_processing = None + self.doc_version_ids = None + self.secret = None # Map data based on event type if self.is_api_gateway: @@ -57,8 +59,10 @@ def map_api_gateway_data(self): self.starting_directory = self.body.get('starting_directory') self.direct_data_listing_response = self.body.get('direct_data_listing_response') self.target_directory = self.body.get('target_directory') - self.source_file = self.body.get('source_file') + self.source_filepath = self.body.get('source_filepath') self.continue_processing = self.body.get('continue_processing') + self.doc_version_ids = self.body.get('doc_version_ids') + self.secret = self.body.get('secret') # Map other API Gateway specific data as needed def map_direct_lambda_data(self): @@ -67,6 +71,7 @@ def map_direct_lambda_data(self): """ # data = self.event.get('data', {}) + print(self.event) self.step = self.event.get('step') self.extract_type = self.event.get('extract_type') self.start_time = self.event.get('start_time') @@ -74,7 +79,53 @@ def map_direct_lambda_data(self): self.starting_directory = self.event.get('starting_directory') self.direct_data_listing_response = self.event.get('direct_data_listing_response') self.target_directory = self.event.get('target_directory') - self.source_file = self.event.get('source_file') + self.source_filepath = self.event.get('source_filepath') self.continue_processing = self.event.get('continue_processing') + self.doc_version_ids = self.event.get('doc_version_ids') + self.secret = self.event.get('secret') + def get_is_api_gateway(self): + return self.is_api_gateway + + def get_http_method(self): + return self.http_method + + def get_resource(self): + return self.resource + + def get_body(self): + return self.body + + def get_step(self): + return self.step + + def get_extract_type(self): + return self.extract_type + + def get_start_time(self): + return self.start_time + + def get_stop_time(self): + return self.stop_time + + def get_starting_directory(self): + return self.starting_directory + + def get_direct_data_listing_response(self): + return self.direct_data_listing_response + + def get_target_directory(self): + return self.target_directory + + def get_source_filepath(self): + return self.source_filepath + + def get_continue_processing(self): + 
return self.continue_processing + + def get_doc_version_ids(self): + return self.doc_version_ids + + def get_secret(self): + return self.secret diff --git a/common/redshiftManager.py b/common/redshiftManager.py new file mode 100644 index 0000000..06473fd --- /dev/null +++ b/common/redshiftManager.py @@ -0,0 +1,347 @@ +from typing import Any + +from .integrationConfigClass import IntegrationConfigClass +from .aws_utilities import RedshiftConnection +from .log_message import log_message + + +def update_table_name_that_starts_with_digit(table_name: str) -> str: + """ + This method handles reconciling Vault objects that begin with a number and appending a 'n_' so that Redshift will + accept the nameing convention + :param table_name: The name of the table that needs to be update + :return: The updated table name + """ + if table_name[0].isdigit(): + return f'n_{table_name}' + else: + return table_name + + +class RedshiftManager: + def __init__(self, settings: IntegrationConfigClass, secret): + self.host = settings.config.get(secret, 'redshift_host') + self.dbname = settings.config.get(secret, 'redshift_dbname') + self.user = settings.config.get(secret, 'redshift_user') + self.password = settings.config.get(secret, 'redshift_password') + self.port = settings.config.get(secret, 'redshift_port') + self.iam_role = settings.config.get(secret, 'redshift_iam_redshift_s3_read') + self.redshift_conn = self.get_redshift_connection() + + def get_redshift_connection(self) -> RedshiftConnection: + + return RedshiftConnection( + db_name=self.dbname, + hostname=self.host, + port_number=self.port, + username=self.user, + user_password=self.password + ) + + def redshift_table_exists(self, schema_name: str, table_name: str, settings: IntegrationConfigClass) -> bool: + """ + This method queries a Redshift database and determines if a specified schema and table exists. If the schema does not + exist, the schema will be created + :param settings: Specified Secrets Manager settings file + :param schema_name: The name of the schema where the tables exist + :param table_name: The name of the table that is to be verified + :return: A boolean that signifies whether the table exists or not + """ + + table_name = update_table_name_that_starts_with_digit(table_name) + + query = f""" + SELECT EXISTS ( + SELECT 1 + FROM information_schema.schemata + WHERE + schema_name = '{schema_name}' + ) + """ + try: + schema_exists_result = self.redshift_conn.table_exists_query_execution(query) + if schema_exists_result is False: + log_message(log_level='Debug', + message=f'{schema_name} does not exist. Creating new schema', + context=None) + create_schema_query = f""" + CREATE SCHEMA {schema_name}; + """ + self.redshift_conn.run_query(create_schema_query, False) + return False + + elif schema_exists_result is True: + log_message(log_level='Debug', + message=f'{schema_name} exists. 
Creating {table_name} in {schema_name} schema', + context=None) + table_exists_query = f""" + SELECT EXISTS ( + SELECT 1 + FROM information_schema.tables + WHERE + table_catalog = '{self.dbname}' + AND table_schema = '{schema_name}' + AND table_name = '{table_name}' + ) + """ + try: + table_exists_result = self.redshift_conn.table_exists_query_execution(table_exists_query) + return table_exists_result + except Exception as e: + log_message(log_level='Error', + message=f'Error checking if table {self.dbname}.{schema_name}.{table_name} exists', + exception=e, + context=None) + raise e + except Exception as e: + log_message(log_level='Error', + message=f'Error checking if table {self.dbname}.{schema_name}{table_name} exists', + exception=e, + context=None) + raise e + + def create_redshift_table(self, schema_name: str, table_name: str, column_types: str): + """ + This method creates a new Redhsift table + + :param schema_name: The name of the schema where the table will be located + :param table_name: The name of the new table + :param column_types: A partial SQL string that defines the columns and data types + """ + table_name = update_table_name_that_starts_with_digit(table_name) + log_message(log_level='Debug', + message=f'Creating redshift table "{schema_name}.{table_name}"', + exception=None, + context=None) + try: + + create_query = f"CREATE TABLE {schema_name}.{table_name} ({column_types})" + self.redshift_conn.run_query(create_query, False) + except Exception as e: + log_message(log_level='Error', + message=f'Error creating table {self.dbname}.{schema_name}.{table_name}', + exception=e, + context=None) + raise e + + def add_foreign_key_constraint(self, schema_name: str, table_name: str, columns_and_references: dict[str, str]): + """ + This method alters an existing table by adding a foreign key constraint + + :param schema_name: Name of the schema the table is located + :param table_name: The name of the table that is being altered + :param columns_and_references: A dictionary that maps the name of the column and the referenced table + """ + log_message(log_level='Debug', + message=f'Table name before update {table_name}', + exception=None, + context=None) + table_name = update_table_name_that_starts_with_digit(table_name) + log_message(log_level='Debug', + message=f'Table name after update {table_name}', + exception=None, + context=None) + alter_query = '' + try: + for column, reference in columns_and_references.items(): + updated_reference = update_table_name_that_starts_with_digit(reference.split(".")[1].lower()) + update_column = update_table_name_that_starts_with_digit(column) + alter_query += f""" + ALTER TABLE {schema_name}.{table_name} + ADD CONSTRAINT fk_constraint_{table_name}_{column} + FOREIGN KEY ({update_column}) REFERENCES {schema_name}.{updated_reference}(id); + """ + self.redshift_conn.run_query(alter_query, False) + except Exception as e: + raise e + + def redshift_drop_columns_from_table(self, schema_table_name: str, columns_to_remove: set[str]): + """ + + This method executes an ALTER statement on a table to drop a specified list of columns. + :param schema_table_name: A concatenated string of the schema name and the table name with a "." 
delimiter + :param columns_to_remove: A set of columns to remove from the specified table + """ + query = '' + + schema_table = schema_table_name.split('.') + + table_name = update_table_name_that_starts_with_digit(schema_table[1]) + + schema_table_name = ".".join([schema_table[0], table_name]) + + if columns_to_remove: + for column in columns_to_remove: + column = update_table_name_that_starts_with_digit(column) + query += f"ALTER TABLE {schema_table_name} DROP COLUMN {column}; " + + self.redshift_conn.run_query(query, False) + + def redshift_update_table(self, schema_name: str, table_name: str, new_column_names: dict[Any, tuple[Any, Any]], + columns_to_drop: set[str]) -> bool: + """ + This method retrieves the current cloumns of a table and compares those to either a list of newly added columns or columns + that should be dropped and updates the table appropriately + :param schema_name: The name of the schema where the talbe is located + :param table_name: The name of the table to be updated + :param new_column_names: A dictionary that maps the new columns to the data types and length of string + :param columns_to_drop: A set of columns that should be dropped from an existing table + """ + + table_name = update_table_name_that_starts_with_digit(table_name) + query = f""" + SELECT column_name + FROM information_schema.columns + WHERE + table_catalog = '{self.dbname}' + AND table_schema = '{schema_name}' + AND table_name = '{table_name}' + """ + redshift_table_name = f"{schema_name}.{table_name}" + try: + current_columns = self.redshift_conn.get_db_column_names(query, False) + new_columns = set(new_column_names.keys()) + added_columns = new_columns - current_columns + removed_columns = set() + if columns_to_drop is not None: + removed_columns = columns_to_drop + if current_columns == new_columns: + log_message(log_level='Info', + message=f'No columns to update for table {redshift_table_name}', + context=None) + return True + else: + redshift_table_name = f"{schema_name}" + if len(added_columns) > 0: + self.redshift_add_columns_to_table(redshift_table_name, + {column: new_column_names[column] for column in added_columns}) + if len(removed_columns) > 0: + self.redshift_drop_columns_from_table(redshift_table_name, removed_columns) + return True + except Exception as e: + log_message(log_level='Error', + message=f'Error updating table {redshift_table_name}', + exception=e, + context=None) + raise e + + def redshift_add_columns_to_table(self, redshift_table_name, columns: dict[Any, tuple[Any, Any]]): + """ + This method adds columns to a specified table + :param redshift_table_name: The name of the table that is having columns added + :param columns: A list of new columns to add + """ + query = f"ALTER TABLE {self.dbname}.{redshift_table_name}" + for column, (data_type, length) in columns.items(): + column = update_table_name_that_starts_with_digit(column) + column_name = '' + if data_type == "id" or (column.lower() == 'id' and data_type == 'string'): + column_name += f'"{column}" VARCHAR({length}) PRIMARY KEY, ' + elif data_type == "datetime": + column_name += f'"{column}" TIMESTAMPTZ, ' + elif data_type == "boolean": + column_name += f'"{column}" BOOLEAN, ' + elif data_type == "number": + column_name += f'"{column}" NUMERIC, ' + elif data_type == "date": + column_name += f'"{column}" DATE, ' + else: + column_name += f'"{column}" VARCHAR({length}), ' + query += f""" + ADD COLUMN {column_name}, + """ + # Remove the trailing comma and whitespace from the last line + query = query.rstrip(", \n") 
+ ";" + self.redshift_conn.run_query(query, False) + + def load_full_data(self, schema_name: str, table_name: str, s3_uri, headers): + """ + This loads a specified CSV file into a specified table + :param schema_name: The name of the schema where the table is located + :param table_name: The name of the table where the data is to be loaded + :param s3_uri: The URI of the CSV in the S3 + :param headers: A string of ordered headers from the CSV + """ + + table_name = update_table_name_that_starts_with_digit(table_name) + if not s3_uri is None: + log_message(log_level='Info', + message=f'Table to be loaded: {schema_name}.{table_name}', + context=None) + query = f"COPY {self.dbname}.{schema_name}.{table_name} ({headers}) FROM '{s3_uri}' " \ + f"IAM_ROLE '{self.iam_role}' " \ + f"FORMAT AS CSV " \ + f"QUOTE '\"' " \ + f"IGNOREHEADER 1 " \ + f"TIMEFORMAT 'auto'" \ + f"ACCEPTINVCHARS " \ + f"FILLRECORD" + try: + self.redshift_conn.run_query(query, False) + return True + except Exception as e: + + log_message(log_level='Error', + message=f'Error loading{schema_name}.{table_name}', + exception=e, + context=None) + raise e + else: + log_message(log_level='Info', + message=f'Load operation for {schema_name}.{table_name} is skipped', + exception=None, context=None) + return True + + def delete_data(self, schema_name: str, table_name: str, s3_file_uri: str): + """ + This method deletes data from a specified table + + :param schema_name: The name of the schema where the table is located + :param table_name: The name of the table that is having data deleted from + :param s3_file_uri: The URI of the deletes file in S3 + """ + + table_name = update_table_name_that_starts_with_digit(table_name) + try: + # create a temporary table to hold the data from the delete file + columns = '' + column_names = '' + if table_name == 'picklist__sys': + column_names += 'object || object_field || picklist_value_name' + columns += 'object VARCHAR(255), object_field VARCHAR(255), picklist_value_name VARCHAR(255)' + elif table_name == 'metadata': + column_names += 'extract || column_name' + columns += 'extract VARCHAR(255), column_name VARCHAR(255)' + else: + column_names += 'id' + columns += 'id VARCHAR(255)' + create_query = f"CREATE TEMPORARY TABLE temp_{table_name}_deletes ({columns}, deleted_date TIMESTAMPTZ)" + self.redshift_conn.run_query(create_query, True) + + # load the data from the _deletes.csv file into the temporary table + copy_query = f""" + COPY temp_{table_name}_deletes FROM '{s3_file_uri}' + IAM_ROLE '{self.iam_role}' + FORMAT AS CSV + QUOTE '\"' + IGNOREHEADER 1 + TIMEFORMAT 'auto' + """ + self.redshift_conn.run_query(copy_query, True) + # delete the matching rows from the target table + delete_query = f"DELETE FROM {self.dbname}.{schema_name}.{table_name} WHERE {column_names} IN (SELECT {column_names} FROM temp_{table_name}_deletes);" + self.redshift_conn.run_query(delete_query, False) + except Exception as e: + log_message(log_level='Error', + message=f'Something went wrong when attempting to delete the data.', + exception=e, + context=None) + raise e + + def drop_table(self, schema_name, table_name): + + table_name = update_table_name_that_starts_with_digit(table_name) + + drop_table_query = f"DROP TABLE {self.dbname}.{schema_name}.{table_name};" + + self.redshift_conn.run_query(drop_table_query, False) diff --git a/common/redshift_setup.py b/common/redshift_setup.py deleted file mode 100644 index e1c197a..0000000 --- a/common/redshift_setup.py +++ /dev/null @@ -1,331 +0,0 @@ -from typing import List, 
Dict, Any - -from .integrationConfigClass import IntegrationConfigClass -import boto3 -from .aws_utilities import RedshiftConnection -from .log_message import log_message - -current_region = str(boto3.Session().region_name) -settings = IntegrationConfigClass(current_region) - -# Get the Redshift configuration values -host = settings.config.get('redshift', 'host') -dbname = settings.config.get('redshift', 'dbname') -user = settings.config.get('redshift', 'user') -password = settings.config.get('redshift', 'password') -port = settings.config.get('redshift', 'port') -# schema = settings.config.get('redshift', 'schema') -year = settings.config.get('system', 'year') -# catalog = settings.config.get('redshift', 'catalog') - -# Create a Redshift connection -redshift_conn: RedshiftConnection = RedshiftConnection( - db_name=dbname, - hostname=host, - port_number=port, - username=user, - user_password=password -) - - -def redshift_table_exists(schema_name: str, table_name: str) -> bool: - """ - This method queries a Redshift database and determines if a specified schema and table exists. If the schema does not - exist, the schema will be created - :param schema_name: The name of the schema where the tables exist - :param table_name: The name of the table that is to be verified - :return: A boolean that signifies whether the table exists or not - """ - - query = f""" - SELECT EXISTS ( - SELECT 1 - FROM information_schema.schemata - WHERE - schema_name = '{schema_name}' - ) - """ - try: - schema_exists_result = redshift_conn.table_exists_query_execution(query) - if schema_exists_result is False: - log_message(log_level='Debug', - message=f'{schema_name} does not exist. Creating new schema', - context=None) - create_schema_query = f""" - CREATE SCHEMA {schema_name}; - """ - redshift_conn.run_query(create_schema_query, False) - return False - - elif schema_exists_result is True: - log_message(log_level='Debug', - message=f'{schema_name} exists. 
Creating {table_name} in {schema_name} schema', - context=None) - table_exists_query = f""" - SELECT EXISTS ( - SELECT 1 - FROM information_schema.tables - WHERE - table_catalog = '{dbname}' - AND table_schema = '{schema_name}' - AND table_name = '{table_name}' - ) - """ - try: - table_exists_result = redshift_conn.table_exists_query_execution(table_exists_query) - return table_exists_result - except Exception as e: - log_message(log_level='Error', - message=f'Error checking if table {dbname}.{schema_name}.{table_name} exists', - exception=e, - context=None) - raise e - except Exception as e: - log_message(log_level='Error', - message=f'Error checking if table {dbname}.{schema_name}{table_name} exists', - exception=e, - context=None) - raise e - - -def create_redshift_table(schema_name: str, table_name: str, column_types: str): - """ - This method creates a new Redhsift table - - :param schema_name: The name of the schema where the table will be located - :param table_name: The name of the new table - :param column_types: A partial SQL string that defines the columns and data types - """ - log_message(log_level='Debug', - message=f'Creating redshift table "{schema_name}.{table_name}"', - exception=None, - context=None) - try: - create_query = f"CREATE TABLE {schema_name}.{table_name} ({column_types})" - redshift_conn.run_query(create_query, False) - except Exception as e: - log_message(log_level='Error', - message=f'Error creating table {dbname}.{schema_name}.{table_name}', - exception=e, - context=None) - raise e - - -def add_foreign_key_constraint(schema_name: str, table_name: str, columns_and_references: dict[str, str]): - """ - This method alters an existing table by adding a foreign key constraint - - :param schema_name: Name of the schema the table is located - :param table_name: The name of the table that is being altered - :param columns_and_references: A dictionary that maps the name of the column and the referenced table - """ - alter_query = '' - try: - for column, reference in columns_and_references.items(): - alter_query += f""" - ALTER TABLE {schema_name}.{table_name} - ADD CONSTRAINT fk_constraint_{table_name}_{column} - FOREIGN KEY ({column}) REFERENCES {schema_name}.{reference.split(".")[1].lower()}(id); - """ - redshift_conn.run_query(alter_query, False) - except Exception as e: - raise e - - -def redshift_drop_columns_from_table(schema_table_name: str, columns_to_remove: set[str]): - """ - - This method executes an ALTER statement on a table to drop a specified list of columns. - :param schema_table_name: A concatenated string of the schema name and the table name with a "." 
delimiter - :param columns_to_remove: A set of columns to remove from the specified table - """ - query = '' - - if columns_to_remove: - for column in columns_to_remove: - query += f"ALTER TABLE {schema_table_name} DROP COLUMN {column}; " - - redshift_conn.run_query(query, False) - - -def redshift_update_table(schema_name: str, table_name: str, new_column_names: dict[Any, tuple[Any, Any]], - columns_to_drop: set[str]) -> bool: - """ - This method retrieves the current cloumns of a table and compares those to either a list of newly added columns or columns - that should be dropped and updates the table appropriately - :param schema_name: The name of the schema where the talbe is located - :param table_name: The name of the table to be updated - :param new_column_names: A dictionary that maps the new columns to the data types and length of string - :param columns_to_drop: A set of columns that should be dropped from an existing table - """ - - query = f""" - SELECT column_name - FROM information_schema.columns - WHERE - table_catalog = '{dbname}' - AND table_schema = '{schema_name}' - AND table_name = '{table_name}' - """ - redshift_table_name = f"{schema_name}.{table_name}" - try: - current_columns = redshift_conn.get_db_column_names(query, False) - new_columns = set(new_column_names.keys()) - added_columns = new_columns - current_columns - removed_columns = set() - if columns_to_drop is not None: - removed_columns = columns_to_drop - if current_columns == new_columns: - log_message(log_level='Info', - message=f'No columns to update for table {redshift_table_name}', - context=None) - return True - else: - redshift_table_name = f"{schema_name}" - if len(added_columns) > 0: - redshift_add_columns_to_table(redshift_table_name, - {column: new_column_names[column] for column in added_columns}) - if len(removed_columns) > 0: - redshift_drop_columns_from_table(redshift_table_name, removed_columns) - return True - except Exception as e: - log_message(log_level='Error', - message=f'Error updating table {redshift_table_name}', - exception=e, - context=None) - raise e - - -def redshift_add_columns_to_table(redshift_table_name, columns: dict[Any, tuple[Any, Any]]): - """ - This method adds columns to a specified table - :param redshift_table_name: The name of the table that is having columns added - :param columns: A list of new columns to add - """ - query = f"ALTER TABLE {dbname}.{redshift_table_name}" - for column, (data_type, length) in columns.items(): - column_name = '' - if data_type == "id" or (column.lower() == 'id' and data_type == 'string'): - column_name += f'"{column}" VARCHAR({length}) PRIMARY KEY, ' - elif data_type == "datetime": - column_name += f'"{column}" TIMESTAMPTZ, ' - elif data_type == "boolean": - column_name += f'"{column}" BOOLEAN, ' - elif data_type == "number": - column_name += f'"{column}" NUMERIC, ' - elif data_type == "date": - column_name += f'"{column}" DATE, ' - else: - column_name += f'"{column}" VARCHAR({length}), ' - query += f""" - ADD COLUMN {column_name}, - """ - # Remove the trailing comma and whitespace from the last line - query = query.rstrip(", \n") + ";" - redshift_conn.run_query(query, False) - - -def get_s3_path(filename, s3_bucket, subfolder): - """ - - :param filename: The name of the file to locate - :param s3_bucket: The name of the S3 bucket - :param subfolder: The directory the file is located - :param file_type: full, updates or deletes. Depending on these choices, the file key is searched. 
- :return: - """ - try: - s3 = boto3.resource('s3') - bucket = s3.Bucket(s3_bucket) - prefix = f"{subfolder}/" - for obj in bucket.objects.filter(Prefix=prefix): - if filename in obj.key: - return f"s3://{s3_bucket}/{obj.key}" - log_message(log_level='Info', - message=f'For {filename}, s3 file not found', - exception=None, context=None) - except Exception as e: - raise e - return None - - -def load_full_data(schema_name: str, table_name: str, s3_uri, headers): - """ - This loads a specified CSV file into a specified table - :param schema_name: The name of the schema where the table is located - :param table_name: The name of the table where the data is to be loaded - :param s3_uri: The URI of the CSV in the S3 - :param headers: A string of ordered headers from the CSV - """ - if not s3_uri is None: - log_message(log_level='Info', - message=f'Table to be loaded: {schema_name}.{table_name}', - context=None) - query = f"COPY {dbname}.{schema_name}.{table_name} ({headers}) FROM '{s3_uri}' " \ - f"IAM_ROLE '{settings.config.get('redshift', 'iam_redshift_s3_read')}' " \ - f"FORMAT AS CSV " \ - f"QUOTE '\"' " \ - f"IGNOREHEADER 1 " \ - f"TIMEFORMAT 'auto'" \ - f"ACCEPTINVCHARS " \ - f"FILLRECORD" - try: - redshift_conn.run_query(query, False) - return True - except Exception as e: - - log_message(log_level='Error', - message=f'{schema_name}.{table_name}', - exception=e, - context=None) - raise e - else: - log_message(log_level='Info', - message=f'Load operation for {schema_name}.{table_name} is skipped', - exception=None, context=None) - return True - - -def delete_data(schema_name: str, table_name: str, s3_file_uri: str): - """ - This method deletes data from a specified table - - :param schema_name: The name of the schema where the table is located - :param table_name: The name of the table that is having data deleted from - :param s3_file_uri: The URI of the deletes file in S3 - """ - try: - # create a temporary table to hold the data from the delete file - columns = '' - column_names = '' - if table_name == 'picklist__sys': - column_names += 'object || object_field || picklist_value_name' - columns += 'object VARCHAR(255), object_field VARCHAR(255), picklist_value_name VARCHAR(255)' - elif table_name == 'metadata': - column_names += 'extract || column_name' - columns += 'extract VARCHAR(255), column_name VARCHAR(255)' - else: - column_names += 'id' - columns += 'id VARCHAR(255)' - create_query = f"CREATE TEMPORARY TABLE temp_{table_name}_deletes ({columns}, deleted_date TIMESTAMPTZ)" - redshift_conn.run_query(create_query, True) - - # load the data from the _deletes.csv file into the temporary table - copy_query = f""" - COPY temp_{table_name}_deletes FROM '{s3_file_uri}' - IAM_ROLE '{settings.config.get('redshift', 'iam_redshift_s3_read')}' - FORMAT AS CSV - QUOTE '\"' - IGNOREHEADER 1 - TIMEFORMAT 'auto' - """ - redshift_conn.run_query(copy_query, True) - # delete the matching rows from the target table - delete_query = f"DELETE FROM {dbname}.{schema_name}.{table_name} WHERE {column_names} IN (SELECT {column_names} FROM temp_{table_name}_deletes);" - redshift_conn.run_query(delete_query, False) - except Exception as e: - log_message(log_level='Error', - message=f'Something went wrong when attempting to delete the data.', - exception=e, - context=None) - raise e diff --git a/common/direct_data_files_interface.py b/common/vault/direct_data_files_interface.py similarity index 51% rename from common/direct_data_files_interface.py rename to common/vault/direct_data_files_interface.py index 
36b0772..ee10cff 100644
--- a/common/direct_data_files_interface.py
+++ b/common/vault/direct_data_files_interface.py
@@ -1,43 +1,133 @@
 import gzip
+import io
+import json
 import math
+import sys
+import time
 import tarfile
 import csv
+import urllib.parse
+from pathlib import Path
 from io import BytesIO, StringIO
 from typing import Dict, List, Any
 import boto3
 import pandas as pd
-from botocore.client import BaseClient
 from botocore.exceptions import ClientError
-from .api.client.vault_client import VaultClient, AuthenticationType, AuthenticationResponse
-from .api.model.response.direct_data_response import DirectDataResponse
-from .api.model.response.vault_response import VaultResponse
-from .api.request.direct_data_request import DirectDataRequest, ExtractType
+from common.aws_utilities import start_batch_job, upload_large_file
+from common.integrationConfigClass import IntegrationConfigClass
+from common.aws_utilities import RedshiftConnection
-from .log_message import log_message
-from .integrationConfigClass import IntegrationConfigClass
-from .redshift_setup import redshift_table_exists, create_redshift_table, redshift_update_table, load_full_data, \
-    delete_data, add_foreign_key_constraint
-from .responseClass import ResponseClass
+from common.log_message import log_message
+from common.redshiftManager import RedshiftManager, update_table_name_that_starts_with_digit
-def load_data_into_redshift(schema_name: str, tables_to_load: Dict[str, str], starting_directory: str, s3_bucket: str):
+def load_data_into_redshift(schema_name: str, tables_to_load: Dict[str, str], starting_directory: str, s3_bucket: str,
+                            extract_docs: bool, settings: IntegrationConfigClass, redshift_manager: RedshiftManager,
+                            secret: str):
     """
     This method defines the S3 URI of the CSV files and retrieves the CSV headers in the order the columns appear in the file.
+    :param secret: The specified configured secret within the Secrets Manager settings file
+    :param redshift_manager: The RedshiftManager that forms queries and establishes Redshift connections
+    :param extract_docs: Whether or not document source content should be extracted
+    :param settings: The Secrets Manager settings file
     :param schema_name: Name of the Redshift schema
     :param tables_to_load: A dictionary that maps the table name to the related CSV file
     :param starting_directory: The starting directory where the direct data file is located
     :param s3_bucket: The name of the S3 bucket where the direct data files are stored.
""" + # List to store asynchronous tasks (if needed) + for table in tables_to_load: csv_file = tables_to_load.get(table) table_s3_uri = f"s3://{s3_bucket}/{starting_directory}/{csv_file}" + table_name = table.split(".")[1] + + if table_name == 'document_version__sys' and extract_docs: + # Example of starting async task (if needed) + retrieve_document_source_content_async(s3_bucket, starting_directory, csv_file, + settings) + + try: + # Load data into Redshift (assuming this operation is synchronous) + redshift_manager.load_full_data(schema_name=schema_name, + table_name=table_name.lower(), + s3_uri=table_s3_uri, + headers=get_csv_headers(s3_bucket, starting_directory, csv_file)) + + except Exception as e: + # Handle any exceptions that occur during the execution of load_data_into_redshift + log_message(log_level='Error', + message=f'Error loading {schema_name}.{table_name}', + exception=e, + context=None) + raise e + + +def retrieve_document_source_content_async(bucket_name, starting_directory, csv_location, settings, secret): + s3 = boto3.client('s3') + + doc_version_ids = retrieve_version_ids(bucket_name, starting_directory, + csv_location) + + doc_version_ids_body_param = json.dumps(doc_version_ids) + body_param_io = io.StringIO(doc_version_ids_body_param) + s3_key = f'{starting_directory}/doc_version_ids.txt' + s3.put_object(Bucket=bucket_name, Key=s3_key, Body=body_param_io.getvalue()) + doc_version_ids_body_param = f's3://{bucket_name}/{s3_key}' + + job_name = settings.config.get(secret, 'job_name') + job_queue = settings.config.get(secret, 'job_queue') + job_definition = settings.config.get(secret, 'job_definition') + job_parameter: Dict[str, str] = {'step': 'extract_docs', + 'source_filepath': f'{starting_directory}/source_docs', + 'extract_type': 'incremental', + 'doc_version_ids': f'{doc_version_ids_body_param}'} + + batch_job_response = start_batch_job(job_name=f'{job_name}-export', job_queue=job_queue, + job_definition=job_definition, + job_parameters=job_parameter) + if 'jobId' in batch_job_response: + log_message(log_level='Info', + message=f"Job started successfully with Job ID: {batch_job_response['jobId']}", + context=None) + else: + log_message(log_level='Error', + message=f'Failed to start job: {batch_job_response}', + context=None) + + +def batch_list(data, batch_size): + """Yield successive n-sized chunks from a list.""" + for i in range(0, len(data), batch_size): + yield data[i:i + batch_size] + - load_full_data(schema_name=schema_name, - table_name=table.split(".")[1].lower(), - s3_uri=table_s3_uri, - headers=get_csv_headers(s3_bucket, starting_directory, csv_file)) +def retrieve_version_ids(bucket_name, starting_directory, csv_location): + log_message(log_level='Info', + message=f'Retrieving document version IDs', + context=None) + s3 = boto3.client('s3') + response = s3.get_object(Bucket=bucket_name, Key=f'{starting_directory}/{csv_location}') + csv_data = response['Body'].read().decode('utf-8') + csv_reader = csv.reader(StringIO(csv_data)) + headers = next(csv_reader) + column_values = [] + + column_name = 'version_id' + try: + column_index = headers.index(column_name) + except ValueError: + raise ValueError(f"Column '{column_name}' not found in CSV headers: {headers}") + + for row in csv_reader: + if len(row) > column_index: + column_values.append(row[column_index]) + else: + column_values.append(None) + + return column_values def get_csv_headers(bucket_name, starting_directory, csv_location): @@ -52,28 +142,45 @@ def get_csv_headers(bucket_name, 
starting_directory, csv_location): log_message(log_level='Info', message=f'Retrieving CSV headers for {starting_directory}/{csv_location}', context=None) + s3 = boto3.client('s3') response = s3.get_object(Bucket=bucket_name, Key=f'{starting_directory}/{csv_location}') - csv_data = response['Body'].read().decode('utf-8') - csv_reader = csv.reader(StringIO(csv_data)) - headers = next(csv_reader) - # Joining headers into a string with ', ' delimiter - headers_str = ', '.join(headers) + try: + with io.TextIOWrapper(response['Body'], encoding='utf-8') as file: + csv_reader = csv.reader(file) + headers = next(csv_reader) # Read the first line containing headers + + updated_headers = [update_table_name_that_starts_with_digit(header) for header in headers] + headers_str = ', '.join(updated_headers) - return headers_str + return headers_str + except csv.Error as e: + log_message(log_level='Error', + message=f'Error reading CSV file: {e}', + exception=e, + context=None) + return None + except StopIteration: + log_message(log_level='Error', + message='CSV file appears to be empty or corrupted', + context=None) + return None def verify_redshift_tables(chunk_size: int, bucket_name: str, manifest_path: str, metadata_path: str, starting_directory: str, extract_type: str, metadata_deletes_filepath: str, - schema_name: str): + schema_name: str, extract_docs: bool, settings: IntegrationConfigClass, secret: str): """ This method creates the Metadata table if it doesn't already exists. It then reads the manifest file and determines whether a table should be created or updated. Then it will determine if data for the specified tables in the manifest file needs data loaded or deleted. + :param secret: The specified configured secret within the setting file + :param settings: The Secrets Manager settings file + :param extract_docs: Extract document source content or not. :param chunk_size: The size of how much the manifest data frame should be chunked - :param bucket_name: Name of the S3 bukcet where the Direct Data files are located + :param bucket_name: Name of the S3 bucket where the Direct Data files are located :param manifest_path: The file path of the manifest file :param metadata_path: The file path of the metadata file :param starting_directory: The directory of the unzipped Direct Data file @@ -81,11 +188,45 @@ def verify_redshift_tables(chunk_size: int, bucket_name: str, manifest_path: str :param metadata_deletes_filepath: The file path of the manifest file that contains deletes :param schema_name: The name of the schema where the tables should be created or updated """ + + log_message(log_level='Info', + message=f'The metadata: {metadata_path} and manifest: {manifest_path}', + exception=None, + context=None) + try: + + # This class handles forming queries and establishing redshift connections + redshift_manager: RedshiftManager = RedshiftManager(settings=settings, secret=secret) + + # Initialize dataframes that will allow the parsing of the appropriate manifest and metadata CSV files. manifest_dataframe_itr = pd.read_csv(manifest_path, chunksize=chunk_size) metadata_dataframe_itr = pd.read_csv(metadata_path) metadata_deletes_dataframe_itr = None + + # Current bug in Direct Data extract where the document_number__v field is defined as a number instead of a + # string for document_version__sys elements. This makes an update to the metadata dataframe to reconcile that + # bug. 
+ metadata_dataframe_itr.loc[ + (metadata_dataframe_itr['extract'] == 'Document.document_version__sys') & + (metadata_dataframe_itr['column_name'] == 'document_number__v'), + ['type', 'length'] + ] = ['String', 255] + + # Current bug in Direct Data extract where the description__sys field is defined as allowing a length of 128 + # characters. Vault allows 255 characters for that field. This makes an update to the metadata dataframe to + # reconcile that bug. + metadata_dataframe_itr.loc[ + (metadata_dataframe_itr['extract'] == 'Object.security_policy__sys') & + (metadata_dataframe_itr['column_name'] == 'description__sys'), + ['type', 'length'] + ] = ['String', 255] + if metadata_deletes_filepath is not None and len(metadata_deletes_filepath) > 0: + log_message(log_level='Info', + message=f'The metadata_deletes file exists', + exception=None, + context=None) metadata_deletes_dataframe_itr = pd.read_csv(metadata_deletes_filepath) except Exception as e: @@ -99,26 +240,38 @@ def verify_redshift_tables(chunk_size: int, bucket_name: str, manifest_path: str tables_to_delete: Dict[str, str] = {} tables_to_verify: List[str] = [] tables_to_create: List[str] = [] + # If this a full extract, the assumption is that the database is being established and the metadata table needs + # to be created first. if extract_type == "full": - load_metadata_table(schema_name=schema_name, metadata_dataframe=metadata_dataframe_itr) - # Process each chunk + load_metadata_table(schema_name=schema_name, + metadata_dataframe=metadata_dataframe_itr, + redshift_manager=redshift_manager, + settings=settings) + # Process each chunked data from the manifest dataframe for chunk in manifest_dataframe_itr: for index, row in chunk.iterrows(): type = row["type"] + # Only process elements that have records present record_count_not_zero = row["records"] > 0 full_table_name: str = row["extract"] if extract_type == 'full' or record_count_not_zero: file_path = row["file"] - if redshift_table_exists(schema_name=schema_name, table_name=full_table_name.split(".")[1].lower()): - if record_count_not_zero: - tables_to_verify.append(full_table_name) - else: - tables_to_create.append(full_table_name) - if type == "updates": - if record_count_not_zero: - tables_to_load[full_table_name] = file_path - elif type == 'deletes': - tables_to_delete[full_table_name] = file_path + table_name = full_table_name.split(".")[1].lower() + if table_name[0].isdigit(): + table_name = 'n_' + table_name + if not (extract_type == "full" and table_name == 'metadata'): + # Check to see if the schema and table exists. 
+ if redshift_manager.redshift_table_exists(schema_name=schema_name, table_name=table_name, + settings=settings): + if record_count_not_zero: + tables_to_verify.append(full_table_name) + else: + tables_to_create.append(full_table_name) + if type == "updates": + if record_count_not_zero: + tables_to_load[full_table_name] = file_path + elif type == 'deletes': + tables_to_delete[full_table_name] = file_path if len(tables_to_create) > 0: log_message(log_level='Info', @@ -126,7 +279,8 @@ def verify_redshift_tables(chunk_size: int, bucket_name: str, manifest_path: str context=None) create_new_redshift_tables(table_names=tables_to_create, schema_name=schema_name, - metadata_dataframe=metadata_dataframe_itr) + metadata_dataframe=metadata_dataframe_itr, + redshift_manager=redshift_manager) if len(tables_to_verify) > 0: log_message(log_level='Info', message=f'Updating existing tables', @@ -134,7 +288,8 @@ def verify_redshift_tables(chunk_size: int, bucket_name: str, manifest_path: str verify_and_update_existing_tables(table_names=tables_to_verify, metadata_dataframe=metadata_dataframe_itr, metadata_deletes_dataframe=metadata_deletes_dataframe_itr, - schema_name=schema_name) + schema_name=schema_name, + redshift_manager=redshift_manager) if len(tables_to_load) > 0: log_message(log_level='Info', message=f'Loading data', @@ -142,15 +297,24 @@ def verify_redshift_tables(chunk_size: int, bucket_name: str, manifest_path: str load_data_into_redshift(schema_name=schema_name, tables_to_load=tables_to_load, starting_directory=starting_directory, - s3_bucket=bucket_name) + s3_bucket=bucket_name, + extract_docs=extract_docs, + settings=settings, + redshift_manager=redshift_manager, + secret=secret) if len(tables_to_delete) > 0: log_message(log_level='Info', message=f'Deleting data from existing tables', context=None) - delete_data_from_redshift_table(tables_to_delete, starting_directory, bucket_name) + delete_data_from_redshift_table(schema_name=schema_name, + table_names=tables_to_delete, + starting_directory=starting_directory, + s3_bucket=bucket_name, + redshift_manager=redshift_manager) -def load_metadata_table(schema_name: str, metadata_dataframe: pd.DataFrame): +def load_metadata_table(schema_name: str, metadata_dataframe: pd.DataFrame, redshift_manager: RedshiftManager, + settings: IntegrationConfigClass): """ This method creates or updates the Metadata table. First it checks to see if the table exists. 
If it does then it will update the columns of the table, if not it will create the table in the specified schema @@ -164,46 +328,90 @@ def load_metadata_table(schema_name: str, metadata_dataframe: pd.DataFrame): column_type_length.update({column: ("string", 1000)}) table_name = 'metadata' - if redshift_table_exists(schema_name=schema_name, table_name=table_name): - redshift_update_table(schema_name=schema_name, - table_name=table_name, - new_column_names=column_type_length, - columns_to_drop=set()) + if redshift_manager.redshift_table_exists(schema_name=schema_name, table_name=table_name, settings=settings): + redshift_manager.redshift_update_table(schema_name=schema_name, + table_name=table_name, + new_column_names=column_type_length, + columns_to_drop=set()) else: - create_redshift_table(schema_name=schema_name, - table_name=table_name, - column_types=create_sql_str(column_type_length, False)) + redshift_manager.create_redshift_table(schema_name=schema_name, + table_name=table_name, + column_types=create_sql_str(column_type_length, False)) def verify_and_update_existing_tables(table_names: List[str], metadata_dataframe: pd.DataFrame, - metadata_deletes_dataframe: pd.DataFrame, schema_name: str): + metadata_deletes_dataframe: pd.DataFrame, schema_name: str, + redshift_manager: RedshiftManager): """ This method verifies that the tables from the manifest files are listed in the metadata file. It then retrieves the columns listed in the metadata file, if the metadata_deletes file exists then it lists the columns that need to be removed. The method then pass the list of new and + :param redshift_manager: :param table_names: A list of tables to verify and update :param metadata_dataframe: The metadata dataframe to parse :param metadata_deletes_dataframe: the metadata_deletes dataframe to parse :param schema_name: The name of the schema the tables are located in :return: """ + log_message(log_level='Debug', + message=f'Start of updating the tables', + exception=None, + context=None) columns_to_add = {} columns_to_remove = [] - for table in table_names: - if is_table_in_extract(metadata_dataframe, table): - # columns: List[str] = metadata_dataframe.loc[table, ["column_name"]] - columns_to_add: dict[Any, tuple[Any, Any]] = retrieve_table_columns_types_and_lengths(table, - metadata_dataframe) + tables_dropped = [] - if metadata_deletes_dataframe is not None and is_table_in_extract(metadata_deletes_dataframe, table): - columns_to_remove = retrieve_columns_to_drop(table, metadata_deletes_dataframe) - redshift_update_table(schema_name=schema_name, - table_name=table.split(".")[1].lower(), - new_column_names=columns_to_add, - columns_to_drop=columns_to_remove) + try: + log_message(log_level='Debug', + message=f'Checking if metadata_deletes exists', + exception=None, + context=None) + if metadata_deletes_dataframe is not None: + tables_dropped = drop_table_for_deleted_objects(schema_name, table_names, metadata_deletes_dataframe) + log_message(log_level='Debug', + message=f'Tables dropped: {tables_dropped}', + exception=None, + context=None) + log_message(log_level='Debug', + message=f'Iterating through all the tables', + exception=None, + context=None) + for table in table_names: + log_message(log_level='Debug', + message=f'Checking if {table} is in the extract', + exception=None, + context=None) + if is_table_in_extract(metadata_dataframe, table): + # columns: List[str] = metadata_dataframe.loc[table, ["column_name"]] + columns_to_add: dict[Any, tuple[Any, Any]] = 
retrieve_table_columns_types_and_lengths(table, + metadata_dataframe) + log_message(log_level='Debug', + message=f'Checking if metadata_deletes exists again', + exception=None, + context=None) + if metadata_deletes_dataframe is not None: + if len(tables_dropped) == 0 or table not in tables_dropped: + columns_to_remove = retrieve_columns_to_drop(table, schema_name, metadata_deletes_dataframe) + + if columns_to_remove: + log_message(log_level='Debug', + message=f'Updating the tables', + exception=None, + context=None) + redshift_manager.redshift_update_table(schema_name=schema_name, + table_name=table.split(".")[1].lower(), + new_column_names=columns_to_add, + columns_to_drop=set(columns_to_remove)) + except Exception as e: + log_message(log_level='Error', + message=f'There was an issue updating the tables.', + exception=e, + context=None) + raise e -def create_new_redshift_tables(table_names: List[str], schema_name: str, metadata_dataframe: pd.DataFrame): +def create_new_redshift_tables(table_names: List[str], schema_name: str, metadata_dataframe: pd.DataFrame, + redshift_manager: RedshiftManager): """ This method creates new Redshift tables. Once the tables are created, @@ -221,34 +429,59 @@ def create_new_redshift_tables(table_names: List[str], schema_name: str, metadat relation_alter_table_and_columns[table] = retrieve_table_reference_columns(table, metadata_dataframe) if not table == 'Picklist.picklist__sys': - create_redshift_table(schema_name=schema_name, - table_name=table.split(".")[1].lower(), - column_types=create_sql_str(creation_columns, False)) + redshift_manager.create_redshift_table(schema_name=schema_name, + table_name=table.split(".")[1].lower(), + column_types=create_sql_str(creation_columns, False)) else: - create_redshift_table(schema_name=schema_name, - table_name=table.split(".")[1].lower(), - column_types=create_sql_str(creation_columns, True)) + redshift_manager.create_redshift_table(schema_name=schema_name, + table_name=table.split(".")[1].lower(), + column_types=create_sql_str(creation_columns, True)) update_reference_columns(table_to_column_dict=relation_alter_table_and_columns, - schema_name=schema_name) + schema_name=schema_name, redshift_manager=redshift_manager) + + +def drop_table_for_deleted_objects(schema_name: str, metadata_deletes_dataframe: pd.DataFrame, + redshift_manager: RedshiftManager): + try: + filtered_rows = metadata_deletes_dataframe[metadata_deletes_dataframe['column_name'] == 'id'] + + unique_tables_to_drop = filtered_rows['extract'].unique() + + processed_tables = set() + + for table_to_drop in unique_tables_to_drop: + if table_to_drop not in processed_tables: + redshift_manager.drop_table(schema_name, table_to_drop.split(".")[1]) + processed_tables.add(table_to_drop) + + return list(processed_tables) + + + except Exception as e: + log_message(log_level='Error', + message=f'Error when searching for tables to drop', + exception=e, + context=None) + raise e def retrieve_columns_to_drop(table: str, metadata_deletes_dataframe: pd.DataFrame): """ This method retrieves a list of columns from the metadata_delete that need to be removed from the table + :param schema_name: The name of the database schema :param table: The name of the table that requires the columns to be dropped :param metadata_deletes_dataframe: The metadata_deletes dataframe :return: A list of columns to drop from the specified table """ try: - creation_filtered_rows = metadata_deletes_dataframe[metadata_deletes_dataframe['extract'] == table] - columns_to_drop = 
creation_filtered_rows['column_name'].values + if is_table_in_extract(metadata_deletes_dataframe, table): + creation_filtered_rows = metadata_deletes_dataframe[metadata_deletes_dataframe['extract'] == table] + return creation_filtered_rows['column_name'].values except Exception as e: raise e - return set(columns_to_drop) - def retrieve_table_columns_types_and_lengths(table: str, metadata_dataframe: pd.DataFrame): """ @@ -290,26 +523,29 @@ def retrieve_table_reference_columns(table: str, metadata_dataframe: pd.DataFram return dict(zip(columns_and_references_array[:, 0], columns_and_references_array[:, 1])) -def update_reference_columns(table_to_column_dict: dict[str, dict[str, str]], schema_name: str): +def update_reference_columns(table_to_column_dict: dict[str, dict[str, str]], schema_name: str, + redshift_manager: RedshiftManager): """ This method loops through the input dictionary and if the reference columns exists, a foreign key is added to the table. + :param redshift_manager: :param table_to_column_dict: :param schema_name: :return: """ for table, column_and_references in table_to_column_dict.items(): if bool(column_and_references): - add_foreign_key_constraint(schema_name=schema_name, - table_name=table.split(".")[1], - columns_and_references=column_and_references) + redshift_manager.add_foreign_key_constraint(schema_name=schema_name, + table_name=table.split(".")[1], + columns_and_references=column_and_references) def delete_data_from_redshift_table(schema_name: str, table_names: Dict[str, str], starting_directory: str, - s3_bucket: str): + s3_bucket: str, redshift_manager: RedshiftManager): """ This method iterates through each table ame and determines the S3 bucket location of the CSV file of the data to delete from the table. + :param redshift_manager: :param schema_name: The name of the schema where the table resides :param table_names: A dictionary that maps the table to the CSV file location and name :param starting_directory: The starting directory of the Direct Data file @@ -317,17 +553,14 @@ def delete_data_from_redshift_table(schema_name: str, table_names: Dict[str, str """ try: for table, file in table_names.items(): - # table_column_condition = is_table_in_extract(metadata_dataframe, table) - # if is_table_in_extract(metadata_dataframe, table): - # columns_and_types: Dict[str, str] = metadata_dataframe.loc[ - # metadata_dataframe['extract'] == table, ["column_name", "type"]] - table_s3_uri = f"s3://{s3_bucket}/{starting_directory}/{file}" - log_message(log_level='Debug', - message=f'Delete file: {table_s3_uri} for table: {table}', - context=None) - delete_data(schema_name=schema_name, - table_name=table.split(".")[1].lower(), - s3_file_uri=table_s3_uri) + if redshift_manager.redshift_table_exists(schema_name=schema_name, table_name=table.split(".")[1].lower()): + table_s3_uri = f"s3://{s3_bucket}/{starting_directory}/{file}" + log_message(log_level='Debug', + message=f'Delete file: {table_s3_uri} for table: {table}', + context=None) + redshift_manager.delete_data(schema_name=schema_name, + table_name=table.split(".")[1].lower(), + s3_file_uri=table_s3_uri) except Exception as e: log_message(log_level='Error', message=f'Error encountered when attempting to delete data', @@ -367,10 +600,12 @@ def create_sql_str(fields_dict: dict[str, tuple[str, int]], is_picklist: bool) - data_type_length = data_type_tuple[1] if math.isnan(data_type_length): - data_type_length = 255 + data_type_length = 17000 else: data_type_length = int(data_type_length) + k = 
update_table_name_that_starts_with_digit(k) + if data_type == "id" or (k.lower() == 'id' and data_type == 'string'): sql_str += f'"{k}" VARCHAR({data_type_length}) PRIMARY KEY, ' elif data_type == "datetime": @@ -429,7 +664,10 @@ def unzip_direct_data_files(bucket_name: str, source_zipped_file_path: str, targ s3_destination_key = f'{target_filepath}{member.name}' # Upload the extracted file to S3 - s3.put_object(Bucket=bucket_name, Key=s3_destination_key, Body=file_content) + if len(file_content) > 5 * 1024 * 1024: + upload_large_file(s3, bucket_name, s3_destination_key, file_content) + else: + s3.put_object(Bucket=bucket_name, Key=s3_destination_key, Body=file_content) return True except tarfile.TarError or gzip.BadGzipFile as e: if isinstance(tarfile.TarError, e): @@ -443,157 +681,3 @@ def unzip_direct_data_files(bucket_name: str, source_zipped_file_path: str, targ exception=e, context=None) return False - - -def retrieve_direct_data_files(list_files_response: DirectDataResponse, bucket_name: str, - starting_directory: str) -> bool: - """ - This method retrieves Direct Data files and stores them on a specified S3 bucket. If there are multiple parts to the - file, this method will merge them and push that completely merged file to the S3 bucket. - - :param list_files_response: A Vapil.py response of a List Direct Data Files API call - :param bucket_name: The name of the S3 bucket where the files are to be pushed too - :param starting_directory: The starting directory where the Direct Data file is to be stored in teh S3 bucket - :return: A boolean that signifies whether the operation was successful or not - """ - vault_client: VaultClient = get_vault_client() - - # request: DirectDataRequest = vault_client.new_request(DirectDataRequest) - - s3: BaseClient = boto3.client(service_name="s3", region_name='us-east-1') - for directDataItem in list_files_response.data: - if directDataItem.size > 0: - try: - object_key = f"{starting_directory}" - request: DirectDataRequest = vault_client.new_request(DirectDataRequest) - if directDataItem.fileparts > 1: - multipart_response = s3.create_multipart_upload(Bucket=bucket_name, Key=object_key) - upload_id = multipart_response['UploadId'] - parts = [] - try: - for file_part in directDataItem.filepart_details: - file_part_number = file_part.filepart - response: VaultResponse = request.download_direct_data_file(file_part.name, - file_part_number) - - response = s3.upload_part( - Bucket=bucket_name, - Key=object_key, - UploadId=upload_id, - PartNumber=file_part_number, - Body=response.binary_content - ) - - part_info = {'PartNumber': file_part_number, 'ETag': response['ETag']} - parts.append(part_info) - - s3.complete_multipart_upload( - Bucket=bucket_name, - Key=object_key, - UploadId=upload_id, - MultipartUpload={'Parts': parts} - ) - except Exception as e: - # Abort the multipart upload in case of an error - s3.abort_multipart_upload(Bucket=bucket_name, Key=object_key, UploadId=upload_id) - log_message(log_level='Error', - message=f'Multi-file upload aborted', - exception=e, - context=None) - raise e - else: - try: - response: VaultResponse = request.download_direct_data_file( - directDataItem.filepart_details[0].name, - directDataItem.filepart_details[0].filepart) - - log_message(log_level='Debug', - message=f'Bucket Name: {bucket_name}, Object key: {object_key}', - context=None) - s3.put_object(Bucket=bucket_name, Key=object_key, Body=response.binary_content) - - - except Exception as e: - # Abort the multipart upload in case of an error - 
log_message(log_level='Error', - message=f'Could not upload content to S3', - exception=e, - context=None) - raise e - except Exception as e: - # Abort the multipart upload in case of an error - log_message(log_level='Error', - message=f'Direct Data retrieval aborted', - exception=e, - context=None) - raise e - return True - - -def list_direct_data_files(start_time: str, stop_time: str, extract_type: str) -> DirectDataResponse | ResponseClass: - """ - This method lists the Direct Data files generated by Vault. - The retrieval is filtered by the provided start and stop times. - - :param start_time: The start time of the Direct Data file generation - :param stop_time: The stop time of the Direct Data file generation - :param extract_type: The extract type (incremental or full) - :return: The Vault API response provided by Vapil.py - """ - vault_client: VaultClient = get_vault_client() - - try: - request: DirectDataRequest = vault_client.new_request(DirectDataRequest) - response: DirectDataResponse = request.retrieve_available_direct_data_files( - extract_type=ExtractType(extract_type.lower()), - start_time=start_time, stop_time=stop_time) - - if response.has_errors(): - raise Exception(response.errors[0].message) - except Exception as e: - log_message(log_level='Error', - message=f'Exception when listing Direct Data files', - exception=e, - context=None) - raise e - - return response - - -def get_vault_client() -> VaultClient | ResponseClass: - """ - This generates a Vault Client from Vapil.py given the proper credentials for the target Vault. - :return: A valid, authenticated Vault Client - """ - current_region = str(boto3.Session().region_name) - integration_class: IntegrationConfigClass = IntegrationConfigClass(current_region) - - vault_client: VaultClient = VaultClient( - vault_client_id='Veeva-Vault-DevSupport-Direct-Data', - vault_username=integration_class.config.get("vault", "username"), - vault_password=integration_class.config.get("vault", "password"), - vault_dns=integration_class.config.get("vault", "dns"), - authentication_type=AuthenticationType.BASIC) - - try: - log_message(log_level='Info', - message='Vault Client is attempting to authenticate', - context=None) - vault_client.authenticate() - - auth_response: AuthenticationResponse = vault_client.authentication_response - - if auth_response.responseStatus == "SUCCESS": - return vault_client - else: - log_message(log_level='Error', - message=f'Vault Client failed to authenticate: {auth_response.errors[0].message}', - context=None) - return ResponseClass(500, auth_response.errors[0].message) - - except Exception as e: - log_message(log_level='Error', - message=f'Could not authenticate', - exception=e, - context=None) - return ResponseClass(500, e) diff --git a/common/vault/vault_utility_interface.py b/common/vault/vault_utility_interface.py new file mode 100644 index 0000000..227e978 --- /dev/null +++ b/common/vault/vault_utility_interface.py @@ -0,0 +1,282 @@ +import json +import time +import urllib.parse +from pathlib import Path +from typing import List + +import boto3 +from botocore.client import BaseClient + +from common.api.client.vault_client import VaultClient, AuthenticationType +from common.api.model.response.authentication_response import AuthenticationResponse +from common.api.model.response.direct_data_response import DirectDataResponse +from common.api.model.response.document_response import DocumentExportResponse +from common.api.model.response.jobs_response import JobCreateResponse +from 
common.api.model.response.vault_response import VaultResponse +from common.api.request.authentication_request import AuthenticationRequest +from common.api.request.direct_data_request import DirectDataRequest, ExtractType +from common.api.request.file_staging_request import FileStagingRequest +from common.api.request.document_request import DocumentRequest +from common.integrationConfigClass import IntegrationConfigClass +from common.log_message import log_message +from common.responseClass import ResponseClass + +vault_client: VaultClient = None + + +def get_vault_client(settings, secret) -> VaultClient | ResponseClass: + """ + This generates a Vault Client from Vapil.py given the proper credentials for the target Vault. + :return: A valid, authenticated Vault Client + """ + global vault_client + + try: + if vault_client is None: + vault_client = VaultClient( + vault_client_id='Veeva-Vault-DevSupport-Direct-Data', + vault_username=settings.config.get(secret, "vault_username"), + vault_password=settings.config.get(secret, "vault_password"), + vault_dns=settings.config.get(secret, "vault_dns"), + authentication_type=AuthenticationType.BASIC) + else: + if vault_client.validate_session(auth_request=vault_client.new_request(AuthenticationRequest)): + return vault_client + + log_message(log_level='Info', + message='Vault Client is attempting to authenticate', + context=None) + + vault_client.authenticate() + + auth_response: AuthenticationResponse = vault_client.authentication_response + + if auth_response.responseStatus == "SUCCESS": + return vault_client + else: + log_message(log_level='Error', + message=f'Vault Client failed to authenticate: {auth_response.errors[0].message}', + context=None) + return ResponseClass(500, auth_response.errors[0].message) + + except Exception as e: + log_message(log_level='Error', + message=f'Could not authenticate', + exception=e, + context=None) + print(e) + return ResponseClass(500, e) + + +def retrieve_direct_data_files(list_files_response: DirectDataResponse, bucket_name: str, + starting_directory: str, secret_name: str, settings: IntegrationConfigClass) -> bool: + """ + This method retrieves Direct Data files and stores them on a specified S3 bucket. If there are multiple parts to the + file, this method will merge them and push that completely merged file to the S3 bucket. + + :param settings: The secret manager settings specified + :param secret_name: The specified secret configuration within the settings file + :param list_files_response: A Vapil.py response of a List Direct Data Files API call + :param bucket_name: The name of the S3 bucket where the files are to be pushed too + :param starting_directory: The starting directory where the Direct Data file is to be stored in teh S3 bucket + :return: A boolean that signifies whether the operation was successful or not + """ + vault_client: VaultClient = get_vault_client(settings=settings, secret=secret_name) + + # request: DirectDataRequest = vault_client.new_request(DirectDataRequest) + + s3: BaseClient = boto3.client(service_name="s3") + # Iterate through the Direct Data file data + for directDataItem in list_files_response.data: + # Only execute if there are records present + if directDataItem.record_count > 0: + try: + object_key = f"{starting_directory}" + request: DirectDataRequest = vault_client.new_request(DirectDataRequest) + # If there are more than one file parts, then merge the file parts into one valid file and push to + # the specified S3 bucket. Otherwise just push the entire file to S3. 
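# Illustrative sketch, not part of the patch: the hunk that continues below merges
# multi-part Direct Data files using the S3 multipart-upload API. A minimal, standalone
# version of that pattern is sketched here; the helper name assemble_parts_to_s3 and its
# parts_iter argument are hypothetical, while the boto3 calls (create_multipart_upload,
# upload_part, complete_multipart_upload, abort_multipart_upload) are the real S3 client
# API. Note that S3 requires every part except the last to be at least 5 MiB.
import boto3


def assemble_parts_to_s3(bucket_name: str, object_key: str, parts_iter) -> None:
    """Stream an iterable of byte strings into one S3 object via multipart upload."""
    s3 = boto3.client('s3')
    upload_id = s3.create_multipart_upload(Bucket=bucket_name, Key=object_key)['UploadId']
    parts = []
    try:
        for part_number, body in enumerate(parts_iter, start=1):
            part = s3.upload_part(Bucket=bucket_name, Key=object_key, UploadId=upload_id,
                                  PartNumber=part_number, Body=body)
            parts.append({'PartNumber': part_number, 'ETag': part['ETag']})
        s3.complete_multipart_upload(Bucket=bucket_name, Key=object_key, UploadId=upload_id,
                                     MultipartUpload={'Parts': parts})
    except Exception:
        # Abort so the incomplete upload does not keep accruing storage charges.
        s3.abort_multipart_upload(Bucket=bucket_name, Key=object_key, UploadId=upload_id)
        raise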
+ if directDataItem.fileparts > 1: + multipart_response = s3.create_multipart_upload(Bucket=bucket_name, Key=object_key) + upload_id = multipart_response['UploadId'] + parts = [] + try: + for file_part in directDataItem.filepart_details: + file_part_number = file_part.filepart + response: VaultResponse = request.download_direct_data_file(file_part.name, + file_part_number) + + response = s3.upload_part( + Bucket=bucket_name, + Key=object_key, + UploadId=upload_id, + PartNumber=file_part_number, + Body=response.binary_content + ) + + part_info = {'PartNumber': file_part_number, 'ETag': response['ETag']} + parts.append(part_info) + + s3.complete_multipart_upload( + Bucket=bucket_name, + Key=object_key, + UploadId=upload_id, + MultipartUpload={'Parts': parts} + ) + except Exception as e: + # Abort the multipart upload in case of an error + s3.abort_multipart_upload(Bucket=bucket_name, Key=object_key, UploadId=upload_id) + log_message(log_level='Error', + message=f'Multi-file upload aborted', + exception=e, + context=None) + raise e + else: + try: + response: VaultResponse = request.download_direct_data_file( + directDataItem.filepart_details[0].name, + directDataItem.filepart_details[0].filepart) + + log_message(log_level='Debug', + message=f'Bucket Name: {bucket_name}, Object key: {object_key}', + context=None) + s3.put_object(Bucket=bucket_name, Key=object_key, Body=response.binary_content) + + + except Exception as e: + # Abort the multipart upload in case of an error + log_message(log_level='Error', + message=f'Could not upload content to S3', + exception=e, + context=None) + raise e + except Exception as e: + # Abort the multipart upload in case of an error + log_message(log_level='Error', + message=f'Direct Data retrieval aborted', + exception=e, + context=None) + raise e + else: + log_message(log_level='Info', + message=f'No records in the Direct Data extract.', + context=None) + return False + return True + + +def list_direct_data_files(start_time: str, stop_time: str, extract_type: str, + secret: str, settings: IntegrationConfigClass) -> DirectDataResponse | ResponseClass: + """ + This method lists the Direct Data files generated by Vault. + The retrieval is filtered by the provided start and stop times. 
+ + :param secret: The specified secret configuration + :param start_time: The start time of the Direct Data file generation + :param stop_time: The stop time of the Direct Data file generation + :param extract_type: The extract type (incremental or full) + :return: The Vault API response provided by Vapil.py + """ + + vault_client: VaultClient = get_vault_client(settings=settings, secret=secret) + + try: + request: DirectDataRequest = vault_client.new_request(DirectDataRequest) + response: DirectDataResponse = request.retrieve_available_direct_data_files( + extract_type=ExtractType(extract_type.lower()), + start_time=start_time, stop_time=stop_time) + + if response.has_errors(): + raise Exception(response.errors[0].message) + except Exception as e: + log_message(log_level='Error', + message=f'Exception when listing Direct Data files', + exception=e, + context=None) + raise e + + return response + + +def export_documents(doc_version_ids: List[str], secret: str, settings: IntegrationConfigClass) -> JobCreateResponse: + vault_client = get_vault_client(settings=settings, secret=secret) + try: + request_string = [] + for doc_version_id in doc_version_ids: + split_version_id: list[str] = doc_version_id.split('_') + doc_id = split_version_id[0] + major_version = split_version_id[1] + minor_version = split_version_id[2] + + doc_version_dict = { + "id": doc_id, + "major_version_number__v": major_version, + "minor_version_number__v": minor_version + } + + request_string.append(doc_version_dict) + + log_message(log_level='Debug', + message=f'Vault Client authenticated. Exporting now', + context=None) + + doc_request: DocumentRequest = vault_client.new_request(DocumentRequest) + doc_response: JobCreateResponse = doc_request.export_document_versions( + request_string=json.dumps(request_string), + include_source=True, + include_renditions=False) + log_message(log_level='Debug', + message=f'Job Initiated', + context=None) + return doc_response + except Exception as e: + raise e + + +def download_documents_to_s3(job_id: int, target_path: str, bucket_name: str, secret: str, + settings: IntegrationConfigClass) -> List[str]: + global vault_client + try: + vault_client = get_vault_client(settings=settings, secret=secret) + + s3 = boto3.client('s3') + + is_vault_job_finished = False + + log_message(log_level='Info', + message=f'Polling status of job {job_id} in Vault', + context=None) + while not is_vault_job_finished: + document_request: DocumentRequest = vault_client.new_request(DocumentRequest) + response: DocumentExportResponse = document_request.retrieve_document_export_results(job_id=job_id) + + log_message(log_level='Debug', + message=f'Document Export results: {response}', + context=None) + + if response.responseStatus == 'SUCCESS': + for exported_document in response.data: + log_message(log_level='Debug', + message=f'File Path on Staging Server: {exported_document.file}', + context=None) + file_staging_request: FileStagingRequest = vault_client.new_request(FileStagingRequest) + log_message(log_level='Debug', + message=f'File Staging Request: {file_staging_request}', + context=None) + file_path = str(Path(f'u{exported_document.user_id__v}{exported_document.file}')) + file_staging_response: VaultResponse = file_staging_request.download_item_content( + item=urllib.parse.quote(file_path)) + log_message(log_level='Debug', + message=f'File Staging results: {file_staging_response}', + context=None) + s3.put_object(Bucket=bucket_name, + 
Key=f'{target_path}/{exported_document.id}_{exported_document.major_version_number__v}_{exported_document.minor_version_number__v}', + Body=file_staging_response.binary_content) + is_vault_job_finished = True + else: + log_message(log_level='Debug', + message=f'Waiting 12s to check the job status', + context=None) + time.sleep(12) + + except Exception as e: + raise e diff --git a/job/run.py b/job/run.py index fd05582..717da59 100644 --- a/job/run.py +++ b/job/run.py @@ -1,16 +1,18 @@ import sys -from common.log_message import log_message +from common.api.model.response.jobs_response import JobCreateResponse +from common.vault.vault_utility_interface import list_direct_data_files, retrieve_direct_data_files, export_documents, \ + download_documents_to_s3 sys.path.append('.') import os import json from typing import Dict - -from common.aws_utilities import invoke_lambda, start_batch_job, get_batch_region -from common.direct_data_files_interface import list_direct_data_files, retrieve_direct_data_files, \ - unzip_direct_data_files +from common.log_message import log_message +from common.aws_utilities import invoke_lambda, start_batch_job, get_batch_region, retrieve_doc_version_ids_from_s3, \ + check_file_exists_s3, get_s3_path +from common.vault.direct_data_files_interface import unzip_direct_data_files, verify_redshift_tables from common.integrationConfigClass import IntegrationConfigClass from common.api.model.response.direct_data_response import DirectDataResponse @@ -20,17 +22,39 @@ def main(): log_message(log_level='Info', message=f'Current region of batch job: {current_region}', context=None) - settings = IntegrationConfigClass(current_region) + secret_name = os.environ.get("SECRET_NAME") + secret = os.environ.get("SECRET") + + log_message(log_level='Info', + message=f'Retrieving {secret} secret from {secret_name}', + context=None) + + settings = IntegrationConfigClass(current_region, secret_name) step = os.environ.get("STEP") extract_type = os.environ.get("EXTRACT_TYPE") - continue_processing = os.environ.get("CONTINUE_PROCESSING") + continue_processing = os.environ.get("CONTINUE_PROCESSING", "false").lower() == "true" + + log_message(log_level='Debug', + message=f'Raw Continue Processing: {os.environ.get("CONTINUE_PROCESSING")}', + context=None) + + if continue_processing is None: + continue_processing = False + + log_message(log_level='Debug', + message=f'Continue Processing: {continue_processing}', + context=None) log_message(log_level='Info', message=f'Starting Transaction with {step} step of a {extract_type} extract', context=None) - s3_bucket = settings.config.get("s3", "bucket_name") - s3_directory = settings.config.get("s3", "starting_directory") + s3_bucket = settings.config.get(secret, "s3_bucket_name") + s3_directory = settings.config.get(secret, "s3_starting_directory") + + job_name = settings.config.get(secret, 'job_name') + job_queue = settings.config.get(secret, 'job_queue') + job_definition = settings.config.get(secret, 'job_definition') if step == "retrieve": start_time = os.environ.get("START_TIME") @@ -38,56 +62,177 @@ def main(): log_message(log_level='Info', message=f'Listing Direct Data files with start time: {start_time} and stop time: {stop_time}', context=None) + # List the the Direct Data files of the specified extract type and time window list_of_direct_data_files_response: DirectDataResponse = list_direct_data_files(start_time=str(start_time), stop_time=str(stop_time), - extract_type=f'{extract_type}_directdata') + extract_type=f'{extract_type}_directdata', 
+ secret=secret, + settings=settings) + # If the file listing was successful and the response is not empty, download the latest Direct Data file in + # the response. if list_of_direct_data_files_response.is_successful() and bool(list_of_direct_data_files_response.data): direct_data_item = list_of_direct_data_files_response.data[-1] file_path_name = direct_data_item.name file_name = direct_data_item.filename retrieval_success = retrieve_direct_data_files(list_files_response=list_of_direct_data_files_response, bucket_name=s3_bucket, - starting_directory=f'{s3_directory}/{file_name}') + starting_directory=f'{s3_directory}/{file_name}', + secret_name=secret, + settings=settings) if retrieval_success: - job_name = settings.config.get('batch', 'job_name') - job_queue = settings.config.get('batch', 'job_queue') - job_definition = settings.config.get('batch', 'job_definition') - job_parameter: Dict[str, str] = {'step': 'unzip', - 'source_filepath': f'{s3_directory}/{file_name}', - 'target_filepath': f'{s3_directory}/{file_path_name}', - 'extract_type': f'{extract_type}', - 'continue_processing': f'{continue_processing}'} - - batch_job_response = start_batch_job(job_name=f'{job_name}-unzip', job_queue=job_queue, - job_definition=job_definition, - job_parameters=job_parameter) + function_name = settings.config.get(secret, 'lambda_function_name') + + payload: Dict[str, str] = {'step': 'unzip', + 'source_filepath': f'{s3_directory}/{file_name}', + 'target_directory': f'{s3_directory}/{file_path_name}', + 'extract_type': f'{extract_type}', + 'continue_processing': f'{continue_processing}', + 'secret': f'{secret}'} + + invoke_lambda(function_name=function_name, payload=json.dumps(payload)) log_message(log_level='Info', - message=f'Starting {job_name} with ID: {batch_job_response["jobId"]} to unzip files', + message=f'Invoking {function_name} with unzip step', context=None) elif step == "unzip": source_filepath = os.environ.get("SOURCE_FILEPATH") - target_filepath = os.environ.get("TARGET_FILEPATH") + target_directory = os.environ.get("TARGET_DIRECTORY") log_message(log_level='Info', - message=f'Unzipping {source_filepath} to {target_filepath}', + message=f'Unzipping {source_filepath} to {target_directory}', context=None) successful_unzip = unzip_direct_data_files(bucket_name=s3_bucket, source_zipped_file_path=source_filepath, - target_filepath=f'{target_filepath}/') + target_filepath=f'{target_directory}/') if successful_unzip and continue_processing: - function_name = settings.config.get('lambda', 'function_name') + function_name = settings.config.get(secret, 'lambda_function_name') payload: Dict[str, str] = {'step': 'load_data', - 'source_file': f'{target_filepath}', - 'extract_type': f'{extract_type}'} + 'source_filepath': f'{target_directory}', + 'extract_type': f'{extract_type}', + 'secret': f'{secret}'} invoke_lambda(function_name=function_name, payload=json.dumps(payload)) log_message(log_level='Info', message=f'Invoking AWS Lambda {function_name} to load the data into Redshift', context=None) + elif step == "load_data": + + source_filepath = os.environ.get("SOURCE_FILEPATH") + extract_source_content = os.environ.get("EXTRACT_SOURCE_CONTENT", "false").lower() == "true" + + if extract_source_content is None: + extract_source_content = False + + log_message(log_level='Debug', + message=f'Source filepath: {source_filepath} and Extract Source content is {extract_source_content}', + context=None) + + # Generate the schema name with the given Vault ID from the Direct Data filename + vault_id = 
source_filepath.split("/")[-1].split("-")[0] + schema_name = f'vault_{vault_id}' + try: + # Get the S3 filepath of the manifest.csv file + manifest_filepath = get_s3_path("manifest", s3_bucket, source_filepath) + log_message(log_level='Debug', + message=f'The manifest file: {manifest_filepath}', + context=None) + # Get the S3 filepath of the metadata.csv file + metadata_filepath = get_s3_path("metadata.csv", s3_bucket, source_filepath) + # If the metadata.csv file does not exist, then retrieve the metadata_full.csv filepath + if metadata_filepath is None or not metadata_filepath.strip(): + metadata_filepath = get_s3_path("metadata_full.csv", s3_bucket, source_filepath) + log_message(log_level='Info', + message=f'The metadata file: {metadata_filepath}', + context=None) + # Get the S3 filepath of the metadata_deletes.csv file. This only exists in incremental extracts. + metadata_deletes_filepath = get_s3_path("metadata_deletes.csv", s3_bucket, source_filepath) + except Exception as e: + log_message(log_level='Info', + message=f'Errors encountered when search for files in S3', + exception=e, + context=None) + try: + # Verify and subsequently load the Direct Data into the specified Redshift database that is specified in + # the Secrets Manager. + verify_redshift_tables(chunk_size=500, + bucket_name=s3_bucket, + manifest_path=manifest_filepath, + metadata_path=metadata_filepath, + starting_directory=source_filepath, + extract_type=extract_type, + metadata_deletes_filepath=metadata_deletes_filepath, + schema_name=schema_name, + settings=settings, + extract_docs=extract_source_content, + secret=secret) + + log_message(log_level='Info', + message='Successfully loaded Vault Direct Data into Redshift', + context=None) + + except Exception as e: + log_message(log_level='Info', + message=f'Errors encountered when attempting to load the data', + exception=e, + context=None) + + elif step == "extract_docs": + doc_version_id_filepath = os.environ.get("DOC_VERSION_IDS") + starting_directory: str = os.environ.get("SOURCE_FILEPATH") + if check_file_exists_s3(bucket_name=s3_bucket, file_key=doc_version_id_filepath[5:].split("/", 1)[1]): + try: + doc_version_ids = retrieve_doc_version_ids_from_s3(doc_version_id_filepath) + + log_message(log_level='Info', + message=f'Downloading {doc_version_ids} to {starting_directory}', + context=None) + + log_message(log_level='Info', + message=f'Exporting documents from Vault', + context=None) + export_documents_response: JobCreateResponse = export_documents(doc_version_ids=doc_version_ids, + secret=secret, settings=settings) + + if export_documents_response.responseStatus == 'SUCCESS': + log_message(log_level='Info', + message=f'Export successful. 
                                    Downloading documents to S3',
+                                context=None)
+                    download_documents_to_s3(job_id=export_documents_response.job_id,
+                                             target_path=f'{starting_directory}',
+                                             bucket_name=s3_bucket, secret=secret, settings=settings)
+                else:
+                    log_message(log_level='Error',
+                                message=f'Error encountered when attempting to export documents',
+                                exception=Exception(export_documents_response.errors[0].message),
+                                context=None)
+                    raise Exception(export_documents_response.errors[0].message)
+            except Exception as e:
+                log_message(log_level='Error',
+                            message=f'Errors encountered while exporting source content from Vault',
+                            exception=e,
+                            context=None)
+
+            if check_file_exists_s3(bucket_name=s3_bucket, file_key=doc_version_id_filepath[5:].split("/", 1)[1]):
+
+                job_parameter: Dict[str, str] = {'step': 'extract_docs',
+                                                 'source_filepath': f'{starting_directory}',
+                                                 'extract_type': 'incremental',
+                                                 'doc_version_ids': f'{doc_version_id_filepath}'}
+
+                start_batch_job(job_name=f'{job_name}-export', job_queue=job_queue,
+                                job_definition=job_definition,
+                                job_parameters=job_parameter)
+            else:
+                log_message(log_level='Info',
+                            message=f'All documents exported successfully',
+                            context=None)
+        else:
+            log_message(log_level='Info',
+                        message=f'Document ID file does not exist',
+                        context=None)

     log_message(log_level='Info',
                 message=f'AWS Batch job finished successfully',
                 context=None)
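The wait loop in download_documents_to_s3 sleeps 12 seconds between Vault job-status checks with no upper bound, so a stalled export would keep the Batch job polling indefinitely. A minimal polling helper in the same spirit is sketched below; wait_until, check_fn, and the timeout default are hypothetical names and values, not part of the integration code.

import time


def wait_until(check_fn, interval_seconds: int = 12, timeout_seconds: int = 3600):
    """Call check_fn until it returns a truthy value or raise once the timeout elapses."""
    deadline = time.monotonic() + timeout_seconds
    while time.monotonic() < deadline:
        result = check_fn()
        if result:
            return result
        time.sleep(interval_seconds)
    raise TimeoutError(f'Condition not met within {timeout_seconds} seconds')

Under this sketch, download_documents_to_s3 could pass a closure that performs one retrieve_document_export_results call and returns the response only when responseStatus is 'SUCCESS', keeping the 12-second cadence while bounding the total wait.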