Skip to content

Commit

Permalink
get_schema lambda returns schema in original upload format (#1972)
Browse files Browse the repository at this point in the history
* resolves #1659
* add format_table_schema function to base schema
* get_schema lambda returns reformatted schema
* tweak get_schema tests
* Update data_product_metadata.py
* remove owner and retention from original schema format
* update docs with get schema endpoint
* rename table schema component
  • Loading branch information
tom-webber authored Oct 19, 2023
1 parent 7814295 commit b256dfe
Show file tree
Hide file tree
Showing 12 changed files with 160 additions and 29 deletions.
7 changes: 7 additions & 0 deletions containers/daap-docs/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [1.0.6] 2023-10-19

### Added

- Documentation for `/data-product/{data-product-name}/table/{table-name}/schema`
GET endpoint.

## [1.0.5] 2023-10-16

### Updated
Expand Down
2 changes: 1 addition & 1 deletion containers/daap-docs/config.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "daap-docs",
"version": "1.0.5",
"version": "1.0.6",
"registry": "ecr",
"ecr": {
"role": "arn:aws:iam::013433889002:role/modernisation-platform-oidc-cicd",
Expand Down
59 changes: 57 additions & 2 deletions containers/daap-docs/src/var/task/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/registerTableSchema"
"$ref": "#/components/schemas/tableSchema"
}
}
}
Expand Down Expand Up @@ -325,12 +325,67 @@
}
}
}
},
"get": {
"tags": [
"get_schema"
],
"summary": "Get the latest schema for a table.",
"description": "Returns a json object describing the table.",
"operationId": "getTableSchema",
"parameters": [
{
  "in": "path",
  "name": "data-product-name",
  "required": true,
  "schema": {
    "type": "string",
    "minLength": 1
  },
  "description": "The name of the data product that contains the table schema. This needs to be a registered data product name."
},
{
  "in": "path",
  "name": "table-name",
  "required": true,
  "schema": {
    "type": "string",
    "minLength": 1,
    "maxLength": 128
  },
  "description": "The name of the table with the schema. Allows lowercase alphanumeric characters and \"_\"."
},
{
"in": "header",
"name": "authorizationToken",
"schema": {
"type": "string",
"format": "uuid"
},
"required": true
}
],
"responses": {
"200": {
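"description": "successful operation, returns the table schema in the format it was originally registered in (the tableSchema component), not the AWS Glue get_table format described at https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue/client/get_table.html",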
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/tableSchema"
}
}
}
},
"404": {
"description": "Data product or table schema not found"
}
}
}
}
},
"components": {
"schemas": {
"registerTableSchema": {
"tableSchema": {
"required": [
"schema"
],
Expand Down
7 changes: 7 additions & 0 deletions containers/daap-get-schema/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [1.1.0] - 2023-10-18

### Changed

- reformat schema on get using base `format_table_schema` function to return schema
matching input format rather than glue format

## [1.0.0]

### Added
Expand Down
2 changes: 1 addition & 1 deletion containers/daap-get-schema/config.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "daap-get-schema",
"version": "1.0.0",
"version": "1.1.0",
"registry": "ecr",
"ecr": {
"role": "arn:aws:iam::013433889002:role/modernisation-platform-oidc-cicd",
Expand Down
9 changes: 6 additions & 3 deletions containers/daap-get-schema/src/var/task/get_schema.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import json
import os
from http import HTTPStatus

import botocore
from data_platform_api_responses import format_response_json, response_status_404
from data_platform_logging import DataPlatformLogger
from data_platform_paths import DataProductConfig
from data_product_metadata import format_table_schema
from dataengineeringutils3.s3 import read_json_from_s3


Expand All @@ -25,7 +27,7 @@ def handler(event, context):
schema_path = config.schema_path(table_name)

try:
registered_schema = read_json_from_s3(schema_path.uri)
registered_glue_schema = read_json_from_s3(schema_path.uri)
except botocore.exceptions.ClientError as e:
if e.response["Error"]["Code"] == "NoSuchKey":
message = f"Schema not found for data product '{data_product_name}', table '{table_name}'"
Expand All @@ -34,7 +36,8 @@ def handler(event, context):
else:
raise

# TODO: return an intelligible subset of the full glue metadata
registered_schema = format_table_schema(registered_glue_schema)

return format_response_json(
status_code=200, json_body=json.dumps(registered_schema)
status_code=HTTPStatus.OK, json_body=json.dumps(registered_schema)
)
3 changes: 2 additions & 1 deletion containers/daap-get-schema/src/var/task/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
boto3==1.28.9
botocore==1.31.9
botocore==1.31.39
dataengineeringutils3==1.4.3
jsonschema==4.19.0
48 changes: 30 additions & 18 deletions containers/daap-get-schema/tests/unit/mock_test.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
from unittest.mock import patch

import pytest
from get_schema import handler
Expand All @@ -14,38 +15,49 @@ def table_name(self):
return "table-name"

@pytest.fixture
def schema(self):
def glue_schema(self):
return {"hello": "world"}

@pytest.fixture
def out_schema(self):
    """Placeholder payload used as the mocked ``format_table_schema`` result."""
    mocked_result = {"goodbye": "world"}
    return mocked_result

@pytest.fixture(autouse=True)
def setup_metadata_bucket(
self, s3_client, data_product_name, table_name, schema, monkeypatch
self, s3_client, data_product_name, table_name, glue_schema, monkeypatch
):
monkeypatch.setenv("METADATA_BUCKET", "metadata")

s3_client.create_bucket(Bucket="metadata")
s3_client.put_object(
Bucket="metadata",
Key=f"{data_product_name}/v1.0/{table_name}/schema.json",
Body=json.dumps(schema),
Body=json.dumps(glue_schema),
)

def test_valid(self, fake_context, data_product_name, table_name, schema):
result = handler(
{
"pathParameters": {
"data-product-name": data_product_name,
"table-name": table_name,
}
},
fake_context,
)
def test_valid(
self,
fake_context,
data_product_name,
table_name,
out_schema,
):
with patch("get_schema.format_table_schema", return_value=out_schema):
result = handler(
{
"pathParameters": {
"data-product-name": data_product_name,
"table-name": table_name,
}
},
fake_context,
)

assert result == {
"body": json.dumps(schema),
"headers": {"Content-Type": "application/json"},
"statusCode": 200,
}
assert result == {
"body": json.dumps(out_schema),
"headers": {"Content-Type": "application/json"},
"statusCode": 200,
}

def test_missing(self, fake_context):
result = handler(
Expand Down
9 changes: 8 additions & 1 deletion containers/daap-python-base/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [4.0.1] - 2023-10-18

### Added

- standalone `format_table_schema` function for converting glue schema to input schema
format

## [4.0.0] - 2023-10-05

### Changed
Expand All @@ -24,7 +31,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added

- 403 error status code to `data_platform_api_responses.py`
- `get_new_version` function to `data_platform_paths``
- `get_new_version` function to `data_platform_paths`

## [3.3.1] - 2023-10-11

Expand Down
2 changes: 1 addition & 1 deletion containers/daap-python-base/config.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"name": "daap-python-base",
"version": "4.0.0",
"version": "4.0.1",
"registry": "ghcr"
}
25 changes: 25 additions & 0 deletions containers/daap-python-base/src/var/task/data_product_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,31 @@
}


def format_table_schema(glue_schema: dict) -> dict:
    """Convert a Glue table schema back into the metadata ingestion layout.

    Args:
        glue_schema (dict): Schema in Glue-compatible format. It must hold a
            ``TableInput`` entry containing ``StorageDescriptor.Columns``.

    Returns:
        dict: Schema in the original ingested metadata format, with a
            ``tableDescription`` and a ``columns`` list whose entries carry
            ``name``, ``type`` and ``description``.
    """
    glue_table = glue_schema["TableInput"]

    reformatted_columns = []
    for glue_column in glue_table["StorageDescriptor"]["Columns"]:
        reformatted_columns.append(
            {
                "name": glue_column["Name"],
                "type": glue_column["Type"],
                # The description is read from "Comment"; None when absent.
                "description": glue_column.get("Comment"),
            }
        )

    return {
        # A missing table description is reported as None.
        "tableDescription": glue_table.get("Description"),
        "columns": reformatted_columns,
    }


def get_data_product_specification_path(
spec_type: JsonSchemaName, version: None | str = None
) -> str:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@
import data_product_metadata
import pytest
from data_platform_paths import JsonSchemaName
from data_product_metadata import DataProductMetadata, DataProductSchema
from data_product_metadata import (
DataProductMetadata,
DataProductSchema,
format_table_schema,
)

test_metadata_pass = {
"name": "test_product",
Expand Down Expand Up @@ -393,3 +397,13 @@ def test_schema_parent_metadata_has_registered_schemas(
input_data=test_schema_pass,
)
assert schema.parent_product_has_registered_schema == expected


@pytest.mark.parametrize(
    "glue_schema, expected",
    [(test_glue_table_input, test_schema_pass)],
)
def test_format_table_schema(glue_schema, expected):
    """Check that a Glue table input converts to the expected schema."""
    assert format_table_schema(glue_schema) == expected

0 comments on commit b256dfe

Please sign in to comment.