Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Find-moj-data-83/glossary #3449

Merged
merged 5 commits into from
Feb 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions python-libraries/data-platform-catalogue/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.16.0] 2024-02-20

### Added

- a `get_glossary_terms` method in the DataHub client and `SearchClient`

## [0.15.0] 2024-02-20

### Added
Expand All @@ -18,10 +24,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed

- bugfix - search now returns correct page results, where start is
individual search result index.
individual search result index.
- bugfix - `upsert_table` client method now adds dataset name to datahub.
- bugfix - `upsert_table` client method no longer duplicates assets associated
with data product.
with data product.

## [0.13.0] 2024-02-14

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -386,3 +386,7 @@ def list_data_product_assets(self, urn, count, start=0) -> SearchResponse:
return self.search_client.list_data_product_assets(
urn=urn, count=count, start=start
)

def get_glossary_terms(self, count: int = 1000) -> SearchResponse:
    """Fetch glossary terms by delegating to the search client.

    Args:
        count: Number of glossary terms to request; forwarded unchanged
            to ``search_client.get_glossary_terms``.

    Returns:
        The ``SearchResponse`` produced by the search client.
    """
    glossary_terms = self.search_client.get_glossary_terms(count)
    return glossary_terms
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Fetch glossary terms via a wildcard ("*") search restricted to the
# GLOSSARY_TERM entity type.
# $count is the number of results requested. `start` is fixed at 0, so the
# query always reads from the first result and offers no paging offset.
query getGlossaryTerms($count: Int!) {
searchAcrossEntities(
input: { types: GLOSSARY_TERM, query: "*", start: 0, count: $count }
) {
start
count
total
searchResults {
entity {
... on GlossaryTerm {
urn
properties {
name
description
}
# Name and description of each parent node of the term.
parentNodes {
nodes {
properties {
name
description
}
}
}
}
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,11 @@ def __init__(self, graph: DataHubGraph):
.joinpath("listDataProductAssets.graphql")
.read_text()
)
self.get_glossary_terms_query = (
files("data_platform_catalogue.client.datahub.graphql")
.joinpath("getGlossaryTerms.graphql")
.read_text()
)

def search(
self,
Expand Down Expand Up @@ -175,6 +180,8 @@ def _map_result_types(self, result_types: Sequence[ResultType]):
types.append("DATA_PRODUCT")
if ResultType.TABLE in result_types:
types.append("DATASET")
if ResultType.GLOSSARY_TERM in result_types:
types.append("GLOSSARY_TERM")
return types

def _map_filters(self, filters: Sequence[MultiSelectFilter]):
Expand Down Expand Up @@ -335,3 +342,39 @@ def _parse_facets(self, facets: list[dict[str, Any]]) -> SearchFacets:
results[field] = options

return SearchFacets(results)

def _parse_glossary_term(self, entity) -> SearchResult:
    """Convert one glossary term entity from a search response.

    Args:
        entity: The ``entity`` dict of a single search result. It must
            contain ``urn``; ``parentNodes`` may be missing or null.

    Returns:
        A ``SearchResult`` of type ``ResultType.GLOSSARY_TERM`` whose
        metadata stores the term's parent nodes under ``"parentNodes"``
        (an empty list when there are none).
    """
    # Only the standard properties are used for glossary terms; the custom
    # properties returned alongside them are intentionally discarded.
    properties, _ = self._parse_properties(entity)

    # NOTE(review): parentNodes looks nullable in DataHub's schema - confirm.
    # Fall back to "no parents" instead of raising on a null/missing value.
    parent_nodes = (entity.get("parentNodes") or {}).get("nodes") or []

    return SearchResult(
        id=entity["urn"],
        result_type=ResultType.GLOSSARY_TERM,
        matches={},
        name=properties["name"],
        description=properties.get("description", ""),
        metadata={"parentNodes": parent_nodes},
        tags=[],
        last_updated=None,
    )

def get_glossary_terms(self, count: int = 1000) -> SearchResponse:
    """Fetch glossary terms from DataHub.

    Executes the getGlossaryTerms GraphQL query with ``count`` as its only
    variable and parses every returned entity as a glossary term.

    Args:
        count: Value passed as the query's ``count`` variable.

    Returns:
        A ``SearchResponse`` holding the query's ``total`` and the parsed
        glossary terms.

    Raises:
        Exception: If executing the query raises ``GraphError``.
    """
    try:
        raw = self.graph.execute_graphql(
            self.get_glossary_terms_query, {"count": count}
        )
    except GraphError as e:
        raise Exception("Unable to execute getGlossaryTerms query") from e

    search_data = raw["searchAcrossEntities"]
    logger.debug(json.dumps(search_data, indent=2))

    terms = [
        self._parse_glossary_term(entity=hit["entity"])
        for hit in search_data["searchResults"]
    ]
    return SearchResponse(total_results=search_data["total"], page_results=terms)
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
class ResultType(Enum):
    """Kinds of entity a search result can represent."""

    DATA_PRODUCT = auto()
    # Sent to DataHub as the "DATASET" entity type when searching.
    TABLE = auto()
    GLOSSARY_TERM = auto()


@dataclass
Expand Down
2 changes: 1 addition & 1 deletion python-libraries/data-platform-catalogue/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "ministryofjustice-data-platform-catalogue"
version = "0.15.0"
version = "0.16.0"
description = "Library to integrate the MoJ data platform with the catalogue component."
authors = ["MoJ Data Platform Team <[email protected]>"]
license = "MIT"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -740,3 +740,77 @@ def test_list_data_product_assets(mock_graph, searcher):
)
],
)


def test_get_glossary_terms(mock_graph, searcher):
    """Glossary term results are parsed and ``count`` reaches the query.

    Covers one term with a parent node and one with none.
    """
    datahub_response = {
        "searchAcrossEntities": {
            "start": 0,
            "count": 2,
            "total": 2,
            "searchResults": [
                {
                    "entity": {
                        "urn": "urn:li:glossaryTerm:022b9b68-c211-47ae-aef0-2db13acfeca8",
                        "properties": {
                            "name": "IAO",
                            "description": "Information asset owner.\n",
                        },
                        "parentNodes": {
                            "nodes": [
                                {
                                    "properties": {
                                        "name": "Data protection terms",
                                        "description": "Data protection terms",
                                    }
                                }
                            ]
                        },
                    }
                },
                {
                    "entity": {
                        "urn": "urn:li:glossaryTerm:0eb7af28-62b4-4149-a6fa-72a8f1fea1e6",
                        "properties": {
                            "name": "Security classification",
                            "description": "Only data that is 'official'",
                        },
                        "parentNodes": {"nodes": []},
                    }
                },
            ],
        }
    }

    mock_graph.execute_graphql = MagicMock(return_value=datahub_response)

    response = searcher.get_glossary_terms(count=2)

    # The requested count must be forwarded as the query's only variable.
    mock_graph.execute_graphql.assert_called_once_with(
        searcher.get_glossary_terms_query, {"count": 2}
    )
    assert response == SearchResponse(
        total_results=2,
        page_results=[
            SearchResult(
                id="urn:li:glossaryTerm:022b9b68-c211-47ae-aef0-2db13acfeca8",
                name="IAO",
                description="Information asset owner.\n",
                metadata={
                    "parentNodes": [
                        {
                            "properties": {
                                "name": "Data protection terms",
                                "description": "Data protection terms",
                            }
                        }
                    ]
                },
                result_type=ResultType.GLOSSARY_TERM,
            ),
            SearchResult(
                id="urn:li:glossaryTerm:0eb7af28-62b4-4149-a6fa-72a8f1fea1e6",
                name="Security classification",
                description="Only data that is 'official'",
                metadata={"parentNodes": []},
                result_type=ResultType.GLOSSARY_TERM,
            ),
        ],
    )
Original file line number Diff line number Diff line change
Expand Up @@ -277,3 +277,10 @@ def test_list_data_product_assets_returns():
urn="urn:li:dataProduct:my_data_product", count=20
)
assert assets


@runs_on_development_server
def test_get_glossary_terms_returns():
    """Smoke test: requesting 20 glossary terms yields a truthy response."""
    catalogue_client = DataHubCatalogueClient(jwt_token=jwt_token, api_url=api_url)
    glossary_terms = catalogue_client.get_glossary_terms(count=20)
    assert glossary_terms
Loading