Skip to content

Commit

Permalink
Add timestamp validation and associated test (#1231)
Browse files Browse the repository at this point in the history
* Add timestamp validation and associated test

* add formatting changes

* inline entity timestamp fixture

* shortened timedelta value
  • Loading branch information
mitchdawson1982 authored Jan 14, 2025
1 parent 6acd450 commit e59e9c9
Show file tree
Hide file tree
Showing 5 changed files with 136 additions and 21 deletions.
29 changes: 16 additions & 13 deletions datahub_client/entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@
from enum import Enum
from typing import Literal, Optional

from pydantic import BaseModel, EmailStr, Field
from pydantic import AfterValidator, BaseModel, EmailStr, Field
from typing_extensions import Annotated

from .validators import check_timestamp_is_in_the_past

DATAHUB_DATE_FORMAT = "%Y%m%d"

Expand Down Expand Up @@ -542,17 +545,23 @@ class Entity(BaseModel):
]
],
)
metadata_last_ingested: Optional[datetime] = Field(
metadata_last_ingested: Annotated[
Optional[datetime], AfterValidator(check_timestamp_is_in_the_past)
] = Field(
description="When the metadata was last updated in the catalogue",
default=None,
examples=[datetime(2011, 10, 2, 3, 0, 0)],
)
created: Optional[datetime] = Field(
created: Annotated[
Optional[datetime], AfterValidator(check_timestamp_is_in_the_past)
] = Field(
description="When the data entity was first created",
default=None,
examples=[datetime(2011, 10, 2, 3, 0, 0)],
)
data_last_modified: Optional[datetime] = Field(
data_last_modified: Annotated[
Optional[datetime], AfterValidator(check_timestamp_is_in_the_past)
] = Field(
description="When the data entity was last modified in the source system",
default=None,
examples=[datetime(2011, 10, 2, 3, 0, 0)],
Expand Down Expand Up @@ -652,7 +661,9 @@ class Table(Entity):
]
],
)
last_datajob_run_date: Optional[datetime] = Field(
last_datajob_run_date: Annotated[
Optional[datetime], AfterValidator(check_timestamp_is_in_the_past)
] = Field(
description="Indicates the time when the data were last refreshed (eg pipeline run with dbt).",
default=None,
examples=[datetime(2011, 10, 2, 3, 0, 0)],
Expand Down Expand Up @@ -681,11 +692,3 @@ class Dashboard(Entity):
description="URL to view the dashboard",
examples=["https://data.justice.gov.uk"],
)


# if __name__ == "__main__":
# import erdantic as erd

# erd.draw(Database, out="database.png")
# erd.draw(Table, out="table.png")
# erd.draw(Chart, out="chart.png")
7 changes: 7 additions & 0 deletions datahub_client/validators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from datetime import datetime


def check_timestamp_is_in_the_past(datetime: datetime) -> datetime:
    """Validate that a timestamp, when present, lies strictly in the past.

    Written to be attached to optional datetime fields (e.g. through
    pydantic's ``AfterValidator``), so ``None`` is a legal input.

    Args:
        datetime: The value to check, or ``None``. The parameter keeps its
            original name for backward compatibility even though it shadows
            the ``datetime`` class inside this function; that is why the
            body never calls ``datetime.now()``.

    Returns:
        The value that was passed in, unchanged.

    Raises:
        ValueError: If the value is equal to or later than the current time.
    """
    # Function-scope import: "now" is read straight from the epoch clock
    # instead of going through the shadowed ``datetime`` name.
    import time

    if datetime is None:
        return datetime

    # Comparing POSIX timestamps works for naive values (interpreted as local
    # time by ``timestamp()``) and timezone-aware values alike.
    if datetime.timestamp() >= time.time():
        raise ValueError("timestamp must be in the past")
    return datetime
105 changes: 105 additions & 0 deletions tests/datahub_client/test_validators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
from datetime import datetime, timedelta

import pytest

from datahub_client.entities import (
AccessInformation,
CustomEntityProperties,
DataSummary,
DomainRef,
Entity,
EntityRef,
EntitySummary,
FurtherInformation,
GlossaryTermRef,
Governance,
OwnerRef,
RelationshipType,
TagRef,
UsageRestrictions,
)


def test_entity_timestamps_in_future_validation():
    """Each validated ``Entity`` timestamp field rejects a future value on its own.

    The fields are exercised one at a time so that removing the validator
    from any single field makes the test fail, instead of being masked by
    the error raised for another field.
    """
    future_timestamp = datetime.now() + timedelta(seconds=5)

    # Entity payload without any timestamps; one future timestamp is added
    # per iteration below.
    entity_data = {
        "urn": "urn:li:chart:(justice-data,absconds)",
        "display_name": "Absconds",
        "name": "Absconds",
        "fully_qualified_name": "",
        "description": "Number of absconds",
        "relationships": {
            RelationshipType.PARENT: [
                EntitySummary(
                    entity_ref=EntityRef(
                        urn="urn:li:database:example", display_name="example"
                    ),
                    description="entity for an example",
                    entity_type="DATABASE",
                    tags=[
                        TagRef(
                            urn="urn:li:tag:dc_display_in_catalogue",
                            display_name="dc_display_in_catalogue",
                        )
                    ],
                )
            ]
        },
        "domain": DomainRef(display_name="HMPPS", urn="urn:li:domain:HMCTS"),
        "governance": Governance(
            data_owner=OwnerRef(
                display_name="John Doe",
                email="john.doe@justice.gov.uk",
                urn="urn:li:corpuser:john.doe",
            ),
            data_stewards=[
                OwnerRef(
                    display_name="Jane Smith",
                    email="jane.smith@justice.gov.uk",
                    urn="urn:li:corpuser:jane.smith",
                )
            ],
            data_custodians=[
                OwnerRef(
                    display_name="Rosanne Columns",
                    email="rosanne.columns@justice.gov.uk",
                    urn="urn:li:corpuser:rosanne.columns",
                )
            ],
        ),
        "glossary_terms": [
            GlossaryTermRef(
                display_name="Essential Shared Data Asset (ESDA)",
                urn="urn:li:glossaryTerm:ESDA",
                description="An ESDA is...",
            )
        ],
        "platform": EntityRef(urn="urn:li:dataPlatform:kafka", display_name="Kafka"),
        "custom_properties": CustomEntityProperties(
            usage_restrictions=UsageRestrictions(
                dpia_required=False, dpia_location="OneTrust"
            ),
            access_information=AccessInformation(
                dc_where_to_access_dataset="Analytical platform",
                source_dataset_name="stg_xhibit_bw_history",
                s3_location="s3://alpha-hmpps-reports-data",
                dc_access_requirements="Access granted on request",
            ),
            data_summary=DataSummary(row_count=123, refresh_period="Daily"),
            further_information=FurtherInformation(
                dc_slack_channel_name="#data-engineering",
                dc_slack_channel_url="https://hmpps-data-engineering.slack.com",
                dc_teams_channel_name="Data team",
                dc_teams_channel_url="https://teams.microsoft.com/l/channel/123",
                dc_team_email="best-team@justice.gov.uk",
            ),
        ),
        "tags_to_display": ["nomis", "data-warehouse"],
    }

    timestamp_fields = ("metadata_last_ingested", "created", "data_last_modified")
    for field_name in timestamp_fields:
        with pytest.raises(ValueError) as exc:
            Entity(**{**entity_data, field_name: future_timestamp})

        # The field name is the assertion message so a failure points at the
        # field whose validation is missing.
        assert "timestamp must be in the past" in str(exc.value), field_name
13 changes: 8 additions & 5 deletions tests/home/templatetags/test_format_timesince.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,13 @@
from home.templatetags.format_timesince import format_timesince


@pytest.mark.parametrize("timesince, expected_result", [
("30 seconds", "30 seconds"),
("1 hour, 45 minutes", "1 hour"),
("1 day, 6 hours, 45 minutes", "1 day"),
])
@pytest.mark.parametrize(
    "timesince, expected_result",
    [
        pytest.param("30 seconds", "30 seconds"),
        pytest.param("1 hour, 45 minutes", "1 hour"),
        pytest.param("1 day, 6 hours, 45 minutes", "1 day"),
    ],
)
def test_format_timesince(timesince, expected_result):
    """Check ``format_timesince`` against each input/expected pair.

    In every case listed, the expected output is the first comma-separated
    unit of the input string.
    """
    formatted = format_timesince(timesince)
    assert formatted == expected_result
3 changes: 0 additions & 3 deletions tests/test_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,3 @@ class TestGlossaryView:
def test_details(self, client):
    """A GET request to the ``home:glossary`` URL returns HTTP 200."""
    glossary_url = reverse("home:glossary")
    response = client.get(glossary_url)
    assert response.status_code == 200



0 comments on commit e59e9c9

Please sign in to comment.