From edb0f19f1b594d662ed12584fe07e6fd348e8f12 Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Wed, 7 Aug 2024 13:50:08 -0700
Subject: [PATCH 01/12] fix(ingest/tableau): prevent empty site content urls
 (#11057)

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
---
 .../src/datahub/ingestion/source/tableau.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau.py
index 9cde3b1f8d3a0..510cb6c96d1f2 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/tableau.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/tableau.py
@@ -757,6 +757,12 @@ def _re_authenticate(self):
         ] = self.config.get_tableau_auth(self.site.content_url)
         self.server.auth.sign_in(tableau_auth)
 
+    @property
+    def site_content_url(self) -> Optional[str]:
+        if self.site and self.site.content_url:
+            return self.site.content_url
+        return None
+
     def _populate_usage_stat_registry(self) -> None:
         if self.server is None:
             return
@@ -2524,7 +2530,9 @@ def emit_sheets_as_charts(
             last_modified = self.get_last_modified(creator, created_at, updated_at)
 
             if sheet.get(c.PATH):
-                site_part = f"/site/{self.site.content_url}" if self.site else ""
+                site_part = (
+                    f"/site/{self.site_content_url}" if self.site_content_url else ""
+                )
                 sheet_external_url = (
                     f"{self.config.connect_uri}/#{site_part}/views/{sheet.get(c.PATH)}"
                 )
@@ -2535,7 +2543,7 @@ def emit_sheets_as_charts(
                 and sheet[c.CONTAINED_IN_DASHBOARDS][0].get(c.PATH)
             ):
                 # sheet contained in dashboard
-                site_part = f"/t/{self.site.content_url}" if self.site else ""
+                site_part = f"/t/{self.site_content_url}" if self.site_content_url else ""
                 dashboard_path = sheet[c.CONTAINED_IN_DASHBOARDS][0][c.PATH]
                 sheet_external_url = f"{self.config.connect_uri}{site_part}/authoring/{dashboard_path}/{quote(sheet.get(c.NAME, ''), safe='')}"
             else:
@@ -2667,7 +2675,7 @@ def emit_workbook_as_container(self, workbook: Dict) -> Iterable[MetadataWorkUni
             else None
         )
 
-        site_part = f"/site/{self.site.content_url}" if self.site else ""
+        site_part = f"/site/{self.site_content_url}" if self.site_content_url else ""
         workbook_uri = workbook.get("uri")
         workbook_part = (
             workbook_uri[workbook_uri.index("/workbooks/") :] if workbook_uri else None
@@ -2826,7 +2834,7 @@ def emit_dashboard(
         updated_at = dashboard.get(c.UPDATED_AT, datetime.now())
         last_modified = self.get_last_modified(creator, created_at, updated_at)
 
-        site_part = f"/site/{self.site.content_url}" if self.site else ""
+        site_part = f"/site/{self.site_content_url}" if self.site_content_url else ""
         dashboard_external_url = (
            f"{self.config.connect_uri}/#{site_part}/views/{dashboard.get(c.PATH, '')}"
         )
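The new site_content_url property collapses both "no site" and "site with an
empty content_url" into None, so every call site needs only a single
truthiness check. A minimal standalone sketch of the same guard, using
hypothetical SiteInfo/Connection stand-ins rather than the actual Tableau
server objects:

    from typing import Optional

    class SiteInfo:
        def __init__(self, content_url: Optional[str]) -> None:
            self.content_url = content_url

    class Connection:
        def __init__(self, site: Optional[SiteInfo]) -> None:
            self.site = site

        @property
        def site_content_url(self) -> Optional[str]:
            # Missing sites and empty content URLs both yield None.
            if self.site and self.site.content_url:
                return self.site.content_url
            return None

    for conn in (Connection(None), Connection(SiteInfo("")), Connection(SiteInfo("acme"))):
        site_part = f"/site/{conn.site_content_url}" if conn.site_content_url else ""
        print(repr(site_part))  # '', '', '/site/acme'
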
From c226883097d01daf2fcb18689aee72ac5bf9f1a0 Mon Sep 17 00:00:00 2001
From: david-leifker <114954101+david-leifker@users.noreply.github.com>
Date: Wed, 7 Aug 2024 15:53:36 -0500
Subject: [PATCH 02/12] feat(entity-client): implement client batch interface
 (#11106)

---
 .../entity/ebean/batch/AspectsBatchImpl.java |  2 +-
 .../entity/ebean/batch/ProposedItem.java     |  8 ++-
 .../metadata/client/JavaEntityClient.java    | 58 ++++++++++-----
 .../linkedin/entity/client/EntityClient.java | 44 +++++------
 .../entity/client/RestliEntityClient.java    | 27 +++----
 .../tests/privileges/test_privileges.py      |  8 +++
 smoke-test/tests/privileges/utils.py         | 70 ++++++++++++++++++-
 7 files changed, 157 insertions(+), 60 deletions(-)

diff --git a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImpl.java b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImpl.java
index a23f6ab175046..7a1af12272ac5 100644
--- a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImpl.java
+++ b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImpl.java
@@ -123,7 +123,7 @@ public AspectsBatchImplBuilder one(BatchItem data, RetrieverContext retrieverCont
     }
 
     public AspectsBatchImplBuilder mcps(
-        List<MetadataChangeProposal> mcps,
+        Collection<MetadataChangeProposal> mcps,
         AuditStamp auditStamp,
         RetrieverContext retrieverContext) {
diff --git a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/ProposedItem.java b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/ProposedItem.java
index 452ed39ddf317..132a731d278af 100644
--- a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/ProposedItem.java
+++ b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/ProposedItem.java
@@ -7,6 +7,7 @@
 import com.linkedin.metadata.aspect.batch.MCPItem;
 import com.linkedin.metadata.models.AspectSpec;
 import com.linkedin.metadata.models.EntitySpec;
+import com.linkedin.metadata.utils.EntityKeyUtils;
 import com.linkedin.metadata.utils.GenericRecordUtils;
 import com.linkedin.mxe.MetadataChangeProposal;
 import com.linkedin.mxe.SystemMetadata;
@@ -63,7 +64,12 @@ public RecordTemplate getRecordTemplate() {
   @Nonnull
   @Override
   public Urn getUrn() {
-    return metadataChangeProposal.getEntityUrn();
+    Urn urn = metadataChangeProposal.getEntityUrn();
+    if (urn == null) {
+      urn =
+          EntityKeyUtils.getUrnFromProposal(metadataChangeProposal, entitySpec.getKeyAspectSpec());
+    }
+    return urn;
   }
 
   @Nullable
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java
index 337288ab59c60..f8370c9efe3e6 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java
@@ -12,7 +12,6 @@
 import com.linkedin.common.AuditStamp;
 import com.linkedin.common.VersionedUrn;
 import com.linkedin.common.urn.Urn;
-import com.linkedin.common.urn.UrnUtils;
 import com.linkedin.data.DataMap;
 import com.linkedin.data.template.RecordTemplate;
 import com.linkedin.data.template.StringArray;
@@ -24,6 +23,7 @@
 import com.linkedin.metadata.aspect.EnvelopedAspectArray;
 import com.linkedin.metadata.aspect.VersionedAspect;
 import com.linkedin.metadata.aspect.batch.AspectsBatch;
+import com.linkedin.metadata.aspect.batch.BatchItem;
 import com.linkedin.metadata.browse.BrowseResult;
 import com.linkedin.metadata.browse.BrowseResultV2;
 import com.linkedin.metadata.entity.DeleteEntityService;
@@ -48,6 +48,7 @@
 import com.linkedin.metadata.search.client.CachingEntitySearchService;
 import com.linkedin.metadata.service.RollbackService;
 import com.linkedin.metadata.timeseries.TimeseriesAspectService;
+import com.linkedin.metadata.utils.AuditStampUtils;
 import com.linkedin.metadata.utils.metrics.MetricUtils;
 import com.linkedin.mxe.MetadataChangeProposal;
 import com.linkedin.mxe.PlatformEvent;
@@ -60,6 +61,7 @@
 import java.net.URISyntaxException;
 import java.time.Clock;
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
@@ -68,6 +70,7 @@
 import java.util.Optional;
 import java.util.Set;
 import java.util.function.Supplier;
+import java.util.stream.Collectors;
 import javax.annotation.Nonnull;
 import javax.annotation.Nullable;
 import lombok.RequiredArgsConstructor;
@@ -738,35 +741,54 @@ public List<EnvelopedAspect> getTimeseriesAspectValues(
     return response.getValues();
   }
 
-  // TODO: Factor out ingest logic into a util that can be accessed by the java client and the
-  // resource
   @Override
-  public String ingestProposal(
+  @Nonnull
+  public List<String> batchIngestProposals(
       @Nonnull OperationContext opContext,
-      @Nonnull final MetadataChangeProposal metadataChangeProposal,
-      final boolean async)
-      throws RemoteInvocationException {
+      @Nonnull Collection<MetadataChangeProposal> metadataChangeProposals,
+      boolean async) {
     String actorUrnStr =
         opContext.getSessionAuthentication().getActor() != null
             ? opContext.getSessionAuthentication().getActor().toUrnStr()
            : Constants.UNKNOWN_ACTOR;
-    final AuditStamp auditStamp =
-        new AuditStamp().setTime(_clock.millis()).setActor(UrnUtils.getUrn(actorUrnStr));
+    final AuditStamp auditStamp = AuditStampUtils.createAuditStamp(actorUrnStr);
 
     AspectsBatch batch =
         AspectsBatchImpl.builder()
-            .mcps(
-                List.of(metadataChangeProposal), auditStamp, opContext.getRetrieverContext().get())
+            .mcps(metadataChangeProposals, auditStamp, opContext.getRetrieverContext().get())
             .build();
 
-    Optional<IngestResult> one =
-        entityService.ingestProposal(opContext, batch, async).stream().findFirst();
+    Map<BatchItem, List<IngestResult>> resultMap =
+        entityService.ingestProposal(opContext, batch, async).stream()
+            .collect(Collectors.groupingBy(IngestResult::getRequest));
+
+    // Update runIds
+    batch.getItems().stream()
+        .filter(resultMap::containsKey)
+        .forEach(
+            requestItem -> {
+              List<IngestResult> results = resultMap.get(requestItem);
+              Optional<Urn> resultUrn =
+                  results.stream().map(IngestResult::getUrn).filter(Objects::nonNull).findFirst();
+              resultUrn.ifPresent(
+                  urn -> tryIndexRunId(opContext, urn, requestItem.getSystemMetadata()));
+            });
 
-    Urn urn = one.map(IngestResult::getUrn).orElse(metadataChangeProposal.getEntityUrn());
-    if (one.isPresent()) {
-      tryIndexRunId(opContext, urn, metadataChangeProposal.getSystemMetadata());
-    }
-    return urn.toString();
+    // Preserve ordering
+    return batch.getItems().stream()
+        .map(
+            requestItem -> {
+              if (resultMap.containsKey(requestItem)) {
+                List<IngestResult> results = resultMap.get(requestItem);
+                return results.stream()
+                    .filter(r -> r.getUrn() != null)
+                    .findFirst()
+                    .map(r -> r.getUrn().toString())
+                    .orElse(null);
+              }
+              return null;
+            })
+        .collect(Collectors.toList());
   }
 
   @SneakyThrows
diff --git a/metadata-service/restli-client-api/src/main/java/com/linkedin/entity/client/EntityClient.java b/metadata-service/restli-client-api/src/main/java/com/linkedin/entity/client/EntityClient.java
index 8821143cde6cc..cb5c691d0cb61 100644
--- a/metadata-service/restli-client-api/src/main/java/com/linkedin/entity/client/EntityClient.java
+++ b/metadata-service/restli-client-api/src/main/java/com/linkedin/entity/client/EntityClient.java
@@ -38,7 +38,6 @@
 import java.util.Map;
 import java.util.Optional;
 import java.util.Set;
-import java.util.stream.Collectors;
 import javax.annotation.Nonnull;
 import javax.annotation.Nullable;
@@ -519,27 +518,17 @@ default String ingestProposal(
     return ingestProposal(opContext, metadataChangeProposal, false);
   }
 
-  String ingestProposal(
+  /**
+   * Ingest a MetadataChangeProposal event.
+   *
+   * @return the urn string ingested
+   */
+  default String ingestProposal(
       @Nonnull OperationContext opContext,
       @Nonnull final MetadataChangeProposal metadataChangeProposal,
       final boolean async)
-      throws RemoteInvocationException;
-
-  @Deprecated
-  default String wrappedIngestProposal(
-      @Nonnull OperationContext opContext, @Nonnull MetadataChangeProposal metadataChangeProposal) {
-    return wrappedIngestProposal(opContext, metadataChangeProposal, false);
-  }
-
-  default String wrappedIngestProposal(
-      @Nonnull OperationContext opContext,
-      @Nonnull MetadataChangeProposal metadataChangeProposal,
-      final boolean async) {
-    try {
-      return ingestProposal(opContext, metadataChangeProposal, async);
-    } catch (RemoteInvocationException e) {
-      throw new RuntimeException(e);
-    }
+      throws RemoteInvocationException {
+    return batchIngestProposals(opContext, List.of(metadataChangeProposal), async).get(0);
   }
 
   @Deprecated
@@ -550,15 +539,20 @@ default List<String> batchIngestProposals(
     return batchIngestProposals(opContext, metadataChangeProposals, false);
   }
 
-  default List<String> batchIngestProposals(
+  /**
+   * Ingest a list of proposals in a batch.
+   *
+   * @param opContext operation context
+   * @param metadataChangeProposals list of proposals
+   * @param async async or sync ingestion path
+   * @return ingested urns
+   */
+  @Nonnull
+  List<String> batchIngestProposals(
      @Nonnull OperationContext opContext,
      @Nonnull final Collection<MetadataChangeProposal> metadataChangeProposals,
      final boolean async)
-      throws RemoteInvocationException {
-    return metadataChangeProposals.stream()
-        .map(proposal -> wrappedIngestProposal(opContext, proposal, async))
-        .collect(Collectors.toList());
-  }
+      throws RemoteInvocationException;
 
   @Deprecated
   Optional<VersionedAspect> getVersionedAspect(
diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java
index fe1ca571efea5..2a3ae5d006ae0 100644
--- a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java
+++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java
@@ -12,7 +12,7 @@
 import com.linkedin.data.template.RecordTemplate;
 import com.linkedin.data.template.StringArray;
 import com.linkedin.entity.AspectsDoGetTimeseriesAspectValuesRequestBuilder;
-import com.linkedin.entity.AspectsDoIngestProposalRequestBuilder;
+import com.linkedin.entity.AspectsDoIngestProposalBatchRequestBuilder;
 import com.linkedin.entity.AspectsGetRequestBuilder;
 import com.linkedin.entity.AspectsRequestBuilders;
 import com.linkedin.entity.EntitiesBatchGetRequestBuilder;
@@ -67,6 +67,7 @@
 import com.linkedin.metadata.search.ScrollResult;
 import com.linkedin.metadata.search.SearchResult;
 import com.linkedin.mxe.MetadataChangeProposal;
+import com.linkedin.mxe.MetadataChangeProposalArray;
 import com.linkedin.mxe.PlatformEvent;
 import com.linkedin.mxe.SystemMetadata;
 import com.linkedin.parseq.retry.backoff.BackoffPolicy;
@@ -1047,23 +1048,23 @@ public List<EnvelopedAspect> getTimeseriesAspectValues(
         .getValues();
   }
 
-  /**
-   * Ingest a MetadataChangeProposal event.
-   *
-   * @return the urn string ingested
-   */
+  @Nonnull
   @Override
-  public String ingestProposal(
+  public List<String> batchIngestProposals(
       @Nonnull OperationContext opContext,
-      @Nonnull final MetadataChangeProposal metadataChangeProposal,
-      final boolean async)
+      @Nonnull Collection<MetadataChangeProposal> metadataChangeProposals,
+      boolean async)
       throws RemoteInvocationException {
-    final AspectsDoIngestProposalRequestBuilder requestBuilder =
+    final AspectsDoIngestProposalBatchRequestBuilder requestBuilder =
         ASPECTS_REQUEST_BUILDERS
-            .actionIngestProposal()
-            .proposalParam(metadataChangeProposal)
+            .actionIngestProposalBatch()
+            .proposalsParam(new MetadataChangeProposalArray(metadataChangeProposals))
             .asyncParam(String.valueOf(async));
-    return sendClientRequest(requestBuilder, opContext.getSessionAuthentication()).getEntity();
+    String result =
+        sendClientRequest(requestBuilder, opContext.getSessionAuthentication()).getEntity();
+    return metadataChangeProposals.stream()
+        .map(proposal -> "success".equals(result) ? proposal.getEntityUrn().toString() : null)
+        .collect(Collectors.toList());
   }
 
   @Override
diff --git a/smoke-test/tests/privileges/test_privileges.py b/smoke-test/tests/privileges/test_privileges.py
index c9a0b62159314..bce7b8a238c38 100644
--- a/smoke-test/tests/privileges/test_privileges.py
+++ b/smoke-test/tests/privileges/test_privileges.py
@@ -4,11 +4,13 @@
 from tests.privileges.utils import (
     assign_role,
     assign_user_to_group,
+    clear_polices,
     create_group,
     create_user,
     create_user_policy,
     remove_group,
     remove_policy,
+    remove_secret,
     remove_user,
     set_base_platform_privileges_policy_status,
     set_view_dataset_sensitive_info_policy_status,
@@ -65,6 +67,12 @@ def privileges_and_test_user_setup(admin_session):
     # Remove test user
     remove_user(admin_session, "urn:li:corpuser:user")
 
+    # Remove secret
+    remove_secret(admin_session, "urn:li:dataHubSecret:TestSecretName")
+
+    # Remove test policies
+    clear_polices(admin_session)
+
     # Restore All users privileges
     set_base_platform_privileges_policy_status("ACTIVE", admin_session)
     set_view_dataset_sensitive_info_policy_status("ACTIVE", admin_session)
diff --git a/smoke-test/tests/privileges/utils.py b/smoke-test/tests/privileges/utils.py
index 1e58ec4085b70..72ad94a42a462 100644
--- a/smoke-test/tests/privileges/utils.py
+++ b/smoke-test/tests/privileges/utils.py
@@ -246,8 +246,8 @@ def create_user_policy(user_urn, privileges, session):
         "variables": {
             "input": {
                 "type": "PLATFORM",
-                "name": "Policy Name",
-                "description": "Policy Description",
+                "name": "Test Policy Name",
+                "description": "Test Policy Description",
                 "state": "ACTIVE",
                 "resources": {"filter": {"criteria": []}},
                 "privileges": privileges,
@@ -288,3 +288,69 @@ def remove_policy(urn, session):
     assert res_data["data"]
     assert res_data["data"]["deletePolicy"]
     assert res_data["data"]["deletePolicy"] == urn
+
+
+def clear_polices(session):
+    list_policy_json = {
+        "query": """query listPolicies($input: ListPoliciesInput!) {
+            listPolicies(input: $input) {
+                start
+                count
+                total
+                policies {
+                    urn
+                    editable
+                    name
+                    description
+                    __typename
+                }
+                __typename
+            }
+        }""",
+        "variables": {
+            "input": {
+                "count": 100,
+                "start": 0,
+                "orFilters": [
+                    {
+                        "and": [
+                            {
+                                "field": "state",
+                                "values": ["ACTIVE"],
+                                "condition": "EQUAL",
+                            },
+                            {
+                                "field": "editable",
+                                "values": ["true"],
+                                "condition": "EQUAL",
+                            },
+                        ]
+                    }
+                ],
+            }
+        },
+    }
+
+    response = session.post(
+        f"{get_frontend_url()}/api/v2/graphql", json=list_policy_json
+    )
+    response.raise_for_status()
+    res_data = response.json()
+
+    assert res_data
+    assert res_data["data"]
+    assert res_data["data"]["listPolicies"]
+    for policy in res_data["data"]["listPolicies"]["policies"]:
+        if "test" in policy["name"].lower() or "test" in policy["description"].lower():
+            remove_policy(policy["urn"], session)
+
+
+def remove_secret(session, urn):
+    remove_secret = {
+        "query": """mutation deleteSecret($urn: String!) {\n
+            deleteSecret(urn: $urn)\n}""",
+        "variables": {"urn": urn},
+    }
+
+    response = session.post(f"{get_frontend_url()}/api/v2/graphql", json=remove_secret)
+    response.raise_for_status()

From a25df8e6a0ab5c36605e674380721edc8f72e95f Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Wed, 7 Aug 2024 14:04:18 -0700
Subject: [PATCH 03/12] fix(snowflake): avoid reporting warnings/info for sys
 tables (#11114)

---
 .../src/datahub/ingestion/api/source.py           |  2 +-
 .../source/snowflake/snowflake_schema_gen.py      |  2 +-
 .../source/snowflake/snowflake_utils.py           | 20 +++++++++++++------
 3 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/metadata-ingestion/src/datahub/ingestion/api/source.py b/metadata-ingestion/src/datahub/ingestion/api/source.py
index a4de8b382430c..3dea3d36f41f1 100644
--- a/metadata-ingestion/src/datahub/ingestion/api/source.py
+++ b/metadata-ingestion/src/datahub/ingestion/api/source.py
@@ -47,7 +47,7 @@
 
 logger = logging.getLogger(__name__)
 
-_MAX_CONTEXT_STRING_LENGTH = 300
+_MAX_CONTEXT_STRING_LENGTH = 1000
 
 
 class SourceCapability(Enum):
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py
index 1d4a5b377da14..a64589bcfed02 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py
@@ -440,7 +440,7 @@ def _process_schema(
                 yield from self._process_tag(tag)
 
         if not snowflake_schema.views and not snowflake_schema.tables:
-            self.structured_reporter.warning(
+            self.structured_reporter.info(
                 title="No tables/views found in schema",
                 message="If tables exist, please grant REFERENCES or SELECT permissions on them.",
                 context=f"{db_name}.{schema_name}",
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py
index a1878963d3798..0177d59ef6b21 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py
@@ -127,6 +127,8 @@ def is_dataset_pattern_allowed(
             SnowflakeObjectDomain.MATERIALIZED_VIEW,
         ):
             return False
+        if _is_sys_table(dataset_name):
+            return False
 
         if len(dataset_params) != 3:
             self.structured_reporter.info(
@@ -176,6 +178,11 @@ def _combine_identifier_parts(
     return f"{db_name}.{schema_name}.{table_name}"
 
 
+def _is_sys_table(table_name: str) -> bool:
+    # Often will look like `SYS$_UNPIVOT_VIEW1737` or `sys$_pivot_view19`.
+    return table_name.lower().startswith("sys$")
+
+
 # Qualified Object names from snowflake audit logs have quotes for snowflake quoted identifiers,
 # For example "test-database"."test-schema".test_table
 # whereas we generate urns without quotes even for quoted identifiers for backward compatibility
@@ -186,12 +193,13 @@ def _cleanup_qualified_name(
 ) -> str:
     name_parts = qualified_name.split(".")
     if len(name_parts) != 3:
-        structured_reporter.info(
-            title="Unexpected dataset pattern",
-            message="We failed to parse a Snowflake qualified name into its constituent parts. "
-            "DB/schema/table filtering may not work as expected on these entities.",
-            context=f"{qualified_name} has {len(name_parts)} parts",
-        )
+        if not _is_sys_table(qualified_name):
+            structured_reporter.info(
+                title="Unexpected dataset pattern",
+                message="We failed to parse a Snowflake qualified name into its constituent parts. "
+                "DB/schema/table filtering may not work as expected on these entities.",
+                context=f"{qualified_name} has {len(name_parts)} parts",
+            )
         return qualified_name.replace('"', "")
     return _combine_identifier_parts(
         db_name=name_parts[0].strip('"'),
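The qualified-name cleanup above can be exercised in isolation. A simplified,
illustrative sketch that mirrors its logic (not the actual module, and without
the structured reporting):

    def cleanup_qualified_name(qualified_name: str) -> str:
        name_parts = qualified_name.split(".")
        if len(name_parts) != 3:
            # e.g. sys$ tables, which are now passed through silently
            return qualified_name.replace('"', "")
        return ".".join(part.strip('"') for part in name_parts)

    print(cleanup_qualified_name('"test-database"."test-schema".test_table'))
    # test-database.test-schema.test_table
    print(cleanup_qualified_name("SYS$_UNPIVOT_VIEW1737"))
    # SYS$_UNPIVOT_VIEW1737
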
From d6e46b9bcf3b8e8b1e8719fb352f5837bf6b402c Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Wed, 7 Aug 2024 14:57:05 -0700
Subject: [PATCH 04/12] fix(ingest): downgrade column type mapping warning to
 info (#11115)

---
 .../datahub/ingestion/source/abs/source.py    | 74 +----------------
 .../ingestion/source/dbt/dbt_common.py        |  7 +-
 .../src/datahub/ingestion/source/s3/source.py | 79 +------------------
 .../ingestion/source/sql/sql_common.py        |  7 +-
 4 files changed, 15 insertions(+), 152 deletions(-)

diff --git a/metadata-ingestion/src/datahub/ingestion/source/abs/source.py b/metadata-ingestion/src/datahub/ingestion/source/abs/source.py
index 39ebd79c2e226..66f268799b2f1 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/abs/source.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/abs/source.py
@@ -8,29 +8,10 @@
 from collections import OrderedDict
 from datetime import datetime
 from pathlib import PurePath
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from typing import Dict, Iterable, List, Optional, Tuple
 
 import smart_open.compression as so_compression
 from more_itertools import peekable
-from pyspark.sql.types import (
-    ArrayType,
-    BinaryType,
-    BooleanType,
-    ByteType,
-    DateType,
-    DecimalType,
-    DoubleType,
-    FloatType,
-    IntegerType,
-    LongType,
-    MapType,
-    NullType,
-    ShortType,
-    StringType,
-    StructField,
-    StructType,
-    TimestampType,
-)
 from smart_open import open as smart_open
 
 from datahub.emitter.mce_builder import (
@@ -48,7 +29,7 @@
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.abs.config import DataLakeSourceConfig, PathSpec
 from datahub.ingestion.source.abs.report import DataLakeSourceReport
@@ -72,22 +53,14 @@
     StatefulIngestionSourceBase,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
-    BooleanTypeClass,
-    BytesTypeClass,
-    DateTypeClass,
-    NullTypeClass,
-    NumberTypeClass,
-    RecordTypeClass,
     SchemaField,
     SchemaFieldDataType,
     SchemaMetadata,
     StringTypeClass,
-    TimeTypeClass,
 )
 from datahub.metadata.schema_classes import (
     DataPlatformInstanceClass,
     DatasetPropertiesClass,
-    MapTypeClass,
     OperationClass,
     OperationTypeClass,
     OtherSchemaClass,
@@ -100,55 +73,12 @@
 logging.getLogger("py4j").setLevel(logging.ERROR)
 logger: logging.Logger = logging.getLogger(__name__)
 
-# for a list of all types, see https://spark.apache.org/docs/3.0.3/api/python/_modules/pyspark/sql/types.html
-_field_type_mapping = {
-    NullType: NullTypeClass,
-    StringType: StringTypeClass,
-    BinaryType: BytesTypeClass,
-    BooleanType: BooleanTypeClass,
-    DateType: DateTypeClass,
-    TimestampType: TimeTypeClass,
-    DecimalType: NumberTypeClass,
-    DoubleType: NumberTypeClass,
-    FloatType: NumberTypeClass,
-    ByteType: BytesTypeClass,
-    IntegerType: NumberTypeClass,
-    LongType: NumberTypeClass,
-    ShortType: NumberTypeClass,
-    ArrayType: NullTypeClass,
-    MapType: MapTypeClass,
-    StructField: RecordTypeClass,
-    StructType: RecordTypeClass,
-}
 
 PAGE_SIZE = 1000
 
 # Hack to support the .gzip extension with smart_open.
 so_compression.register_compressor(".gzip", so_compression._COMPRESSOR_REGISTRY[".gz"])
 
 
-def get_column_type(
-    report: SourceReport, dataset_name: str, column_type: str
-) -> SchemaFieldDataType:
-    """
-    Maps known Spark types to datahub types
-    """
-    TypeClass: Any = None
-
-    for field_type, type_class in _field_type_mapping.items():
-        if isinstance(column_type, field_type):
-            TypeClass = type_class
-            break
-
-    # if still not found, report the warning
-    if TypeClass is None:
-        report.report_warning(
-            dataset_name, f"unable to map type {column_type} to metadata schema"
-        )
-        TypeClass = NullTypeClass
-
-    return SchemaFieldDataType(type=TypeClass())
-
-
 # config flags to emit telemetry for
 config_options_to_report = [
     "platform",
diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py
index ead86acc299ca..e2b5f8378732c 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py
@@ -849,8 +849,11 @@ def get_column_type(
     # if still not found, report the warning
     if TypeClass is None:
         if column_type:
-            report.report_warning(
-                dataset_name, f"unable to map type {column_type} to metadata schema"
+            report.info(
+                title="Unable to map column types to DataHub types",
+                message="Got an unexpected column type. The column's parsed field type will not be populated.",
+                context=f"{dataset_name} - {column_type}",
+                log=False,
             )
         TypeClass = NullTypeClass
 
diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py
index b8c7fd5aa88fc..f81d06c35e3b0 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py
@@ -8,32 +8,13 @@
 from collections import OrderedDict
 from datetime import datetime
 from pathlib import PurePath
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from typing import Dict, Iterable, List, Optional, Tuple
 
 import smart_open.compression as so_compression
 from more_itertools import peekable
 from pyspark.conf import SparkConf
 from pyspark.sql import SparkSession
 from pyspark.sql.dataframe import DataFrame
-from pyspark.sql.types import (
-    ArrayType,
-    BinaryType,
-    BooleanType,
-    ByteType,
-    DateType,
-    DecimalType,
-    DoubleType,
-    FloatType,
-    IntegerType,
-    LongType,
-    MapType,
-    NullType,
-    ShortType,
-    StringType,
-    StructField,
-    StructType,
-    TimestampType,
-)
 from pyspark.sql.utils import AnalysisException
 from smart_open import open as smart_open
 
@@ -52,7 +33,7 @@
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.s3_boto_utils import get_s3_tags, list_folders
 from datahub.ingestion.source.aws.s3_util import (
@@ -72,22 +53,13 @@
     StatefulIngestionSourceBase,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
-    BooleanTypeClass,
-    BytesTypeClass,
-    DateTypeClass,
-    NullTypeClass,
-    NumberTypeClass,
-    RecordTypeClass,
     SchemaField,
-    SchemaFieldDataType,
     SchemaMetadata,
     StringTypeClass,
-    TimeTypeClass,
 )
 from datahub.metadata.schema_classes import (
     DataPlatformInstanceClass,
     DatasetPropertiesClass,
-    MapTypeClass,
     OperationClass,
     OperationTypeClass,
     OtherSchemaClass,
@@ -101,55 +73,12 @@
 logging.getLogger("py4j").setLevel(logging.ERROR)
 logger: logging.Logger = logging.getLogger(__name__)
 
-# for a list of all types, see https://spark.apache.org/docs/3.0.3/api/python/_modules/pyspark/sql/types.html
-_field_type_mapping = {
-    NullType: NullTypeClass,
-    StringType: StringTypeClass,
-    BinaryType: BytesTypeClass,
-    BooleanType: BooleanTypeClass,
-    DateType: DateTypeClass,
-    TimestampType: TimeTypeClass,
-    DecimalType: NumberTypeClass,
-    DoubleType: NumberTypeClass,
-    FloatType: NumberTypeClass,
-    ByteType: BytesTypeClass,
-    IntegerType: NumberTypeClass,
-    LongType: NumberTypeClass,
-    ShortType: NumberTypeClass,
-    ArrayType: NullTypeClass,
-    MapType: MapTypeClass,
-    StructField: RecordTypeClass,
-    StructType: RecordTypeClass,
-}
 
 PAGE_SIZE = 1000
 
 # Hack to support the .gzip extension with smart_open.
 so_compression.register_compressor(".gzip", so_compression._COMPRESSOR_REGISTRY[".gz"])
 
 
-def get_column_type(
-    report: SourceReport, dataset_name: str, column_type: str
-) -> SchemaFieldDataType:
-    """
-    Maps known Spark types to datahub types
-    """
-    TypeClass: Any = None
-
-    for field_type, type_class in _field_type_mapping.items():
-        if isinstance(column_type, field_type):
-            TypeClass = type_class
-            break
-
-    # if still not found, report the warning
-    if TypeClass is None:
-        report.report_warning(
-            dataset_name, f"unable to map type {column_type} to metadata schema"
-        )
-        TypeClass = NullTypeClass
-
-    return SchemaFieldDataType(type=TypeClass())
-
-
 # config flags to emit telemetry for
 config_options_to_report = [
     "platform",
@@ -490,9 +419,7 @@ def add_partition_columns_to_schema(
                 if not is_fieldpath_v2
                 else f"[version=2.0].[type=string].{partition_key}",
                 nativeDataType="string",
-                type=SchemaFieldDataType(StringTypeClass())
-                if not is_fieldpath_v2
-                else SchemaFieldDataTypeClass(type=StringTypeClass()),
+                type=SchemaFieldDataTypeClass(StringTypeClass()),
                 isPartitioningKey=True,
                 nullable=True,
                 recursive=False,
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py
index 1fa308eae6b76..2ab1e6bb41af1 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py
@@ -263,8 +263,11 @@ def get_column_type(
             break
 
     if TypeClass is None:
-        sql_report.report_warning(
-            dataset_name, f"unable to map type {column_type!r} to metadata schema"
+        sql_report.info(
+            title="Unable to map column types to DataHub types",
+            message="Got an unexpected column type. The column's parsed field type will not be populated.",
+            context=f"{dataset_name} - {column_type!r}",
+            log=False,
        )
         TypeClass = NullTypeClass
 

From e08412e513215405902a61713895bbefa2ed624e Mon Sep 17 00:00:00 2001
From: Ajoy Majumdar
Date: Thu, 8 Aug 2024 08:00:38 -0700
Subject: [PATCH 05/12] feat(api): add AuditStamp to the V3 API entity/aspect
 response (#11118)

---
 .../openapi/v3/models/AspectItem.java       | 15 +++++++
 .../openapi/v3/models/GenericAspectV3.java  |  1 +
 .../openapi/v3/models/GenericEntityV3.java  | 19 ++++----
 .../openapi/v3/OpenAPIV3Generator.java      | 26 +++++++----
 .../v3/controller/EntityController.java     | 44 ++++++++++++++-----
 5 files changed, 77 insertions(+), 28 deletions(-)
 create mode 100644 metadata-service/openapi-servlet/models/src/main/java/io/datahubproject/openapi/v3/models/AspectItem.java

diff --git a/metadata-service/openapi-servlet/models/src/main/java/io/datahubproject/openapi/v3/models/AspectItem.java b/metadata-service/openapi-servlet/models/src/main/java/io/datahubproject/openapi/v3/models/AspectItem.java
new file mode 100644
index 0000000000000..ec5dff7817231
--- /dev/null
+++ b/metadata-service/openapi-servlet/models/src/main/java/io/datahubproject/openapi/v3/models/AspectItem.java
@@ -0,0 +1,15 @@
+package io.datahubproject.openapi.v3.models;
+
+import com.linkedin.common.AuditStamp;
+import com.linkedin.data.template.RecordTemplate;
+import com.linkedin.mxe.SystemMetadata;
+import lombok.Builder;
+import lombok.Value;
+
+@Builder(toBuilder = true)
+@Value
+public class AspectItem {
+  RecordTemplate aspect;
+  SystemMetadata systemMetadata;
+  AuditStamp auditStamp;
+}
diff --git a/metadata-service/openapi-servlet/models/src/main/java/io/datahubproject/openapi/v3/models/GenericAspectV3.java b/metadata-service/openapi-servlet/models/src/main/java/io/datahubproject/openapi/v3/models/GenericAspectV3.java
index 4db2c3288d154..70bf2182c29f4 100644
--- a/metadata-service/openapi-servlet/models/src/main/java/io/datahubproject/openapi/v3/models/GenericAspectV3.java
+++ b/metadata-service/openapi-servlet/models/src/main/java/io/datahubproject/openapi/v3/models/GenericAspectV3.java
@@ -19,4 +19,5 @@ public class GenericAspectV3 implements GenericAspect {
   @Nonnull Map<String, Object> value;
   @Nullable Map<String, Object> systemMetadata;
   @Nullable Map<String, String> headers;
+  @Nullable Map<String, Object> auditStamp;
 }
diff --git a/metadata-service/openapi-servlet/models/src/main/java/io/datahubproject/openapi/v3/models/GenericEntityV3.java b/metadata-service/openapi-servlet/models/src/main/java/io/datahubproject/openapi/v3/models/GenericEntityV3.java
index 3af3b25028fad..54d6ac2c1736f 100644
--- a/metadata-service/openapi-servlet/models/src/main/java/io/datahubproject/openapi/v3/models/GenericEntityV3.java
+++ b/metadata-service/openapi-servlet/models/src/main/java/io/datahubproject/openapi/v3/models/GenericEntityV3.java
@@ -5,9 +5,6 @@
 import com.fasterxml.jackson.core.type.TypeReference;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.linkedin.common.urn.Urn;
-import com.linkedin.data.template.RecordTemplate;
-import com.linkedin.mxe.SystemMetadata;
-import com.linkedin.util.Pair;
 import io.datahubproject.openapi.models.GenericEntity;
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
@@ -42,9 +39,7 @@ public Map getAspects() {
   public static class GenericEntityV3Builder {
 
     public GenericEntityV3 build(
-        ObjectMapper objectMapper,
-        @Nonnull Urn urn,
-        Map<String, Pair<RecordTemplate, SystemMetadata>> aspects) {
+        ObjectMapper objectMapper, @Nonnull Urn urn, Map<String, AspectItem> aspects) {
       Map<String, Object> jsonObjectMap =
           aspects.entrySet().stream()
               .map(
@@ -53,13 +48,18 @@ public GenericEntityV3 build(
                       String aspectName = entry.getKey();
                       Map<String, Object> aspectValue =
                           objectMapper.readValue(
-                              RecordUtils.toJsonString(entry.getValue().getFirst())
+                              RecordUtils.toJsonString(entry.getValue().getAspect())
                                   .getBytes(StandardCharsets.UTF_8),
                               new TypeReference<>() {});
                       Map<String, Object> systemMetadata =
-                          entry.getValue().getSecond() != null
+                          entry.getValue().getSystemMetadata() != null
                               ? objectMapper.convertValue(
-                                  entry.getValue().getSecond(), new TypeReference<>() {})
+                                  entry.getValue().getSystemMetadata(), new TypeReference<>() {})
+                              : null;
+                      Map<String, Object> auditStamp =
+                          entry.getValue().getAuditStamp() != null
+                              ? objectMapper.convertValue(
+                                  entry.getValue().getAuditStamp().data(), new TypeReference<>() {})
                               : null;
 
                       return Map.entry(
@@ -67,6 +67,7 @@ public GenericEntityV3 build(
                           GenericAspectV3.builder()
                               .value(aspectValue)
                               .systemMetadata(systemMetadata)
+                              .auditStamp(auditStamp)
                               .build());
                     } catch (IOException ex) {
                       throw new RuntimeException(ex);
diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/OpenAPIV3Generator.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/OpenAPIV3Generator.java
index f26ad6821c583..f6f248be77c67 100644
--- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/OpenAPIV3Generator.java
+++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/OpenAPIV3Generator.java
@@ -40,7 +40,7 @@ public class OpenAPIV3Generator {
   private static final String NAME_QUERY = "query";
   private static final String NAME_PATH = "path";
   private static final String NAME_SYSTEM_METADATA = "systemMetadata";
-  private static final String NAME_ASYNC = "async";
+  private static final String NAME_AUDIT_STAMP = "auditStamp";
   private static final String NAME_VERSION = "version";
   private static final String NAME_SCROLL_ID = "scrollId";
   private static final String NAME_INCLUDE_SOFT_DELETE = "includeSoftDelete";
@@ -77,9 +77,6 @@ public static OpenAPI generateOpenApiSpec(EntityRegistry entityRegistry) {
     // Components
     final Components components = new Components();
     // --> Aspect components
-    // TODO: Correct handling of SystemMetadata and SortOrder
-    components.addSchemas(
-        "SystemMetadata", new Schema().type(TYPE_OBJECT).additionalProperties(true));
     components.addSchemas("SortOrder", new Schema()._enum(List.of("ASCENDING", "DESCENDING")));
     components.addSchemas("AspectPatch", buildAspectPatchSchema());
     components.addSchemas(
@@ -167,6 +164,10 @@ public static OpenAPI generateOpenApiSpec(EntityRegistry entityRegistry) {
                         buildSingleEntityAspectPath(
                             e, a.getName(), a.getPegasusSchema().getName())));
         });
+    // TODO: Correct handling of SystemMetadata and AuditStamp
+    components.addSchemas(
+        "SystemMetadata", new Schema().type(TYPE_OBJECT).additionalProperties(true));
+    components.addSchemas("AuditStamp", new Schema().type(TYPE_OBJECT).additionalProperties(true));
     return new OpenAPI().openapi("3.0.1").info(info).paths(paths).components(components);
   }
 
@@ -185,7 +186,7 @@ private static PathItem buildSingleEntityPath(final EntitySpec entity) {
                     .schema(new Schema().type(TYPE_STRING)),
                 new Parameter()
                     .in(NAME_QUERY)
-                    .name("systemMetadata")
+                    .name(NAME_SYSTEM_METADATA)
                     .description("Include systemMetadata with response.")
                     .schema(new Schema().type(TYPE_BOOLEAN)._default(false)),
                 new Parameter()
@@ -424,7 +425,7 @@ private static PathItem buildBatchGetEntityPath(final EntitySpec entity) {
             List.of(
                 new Parameter()
                     .in(NAME_QUERY)
-                    .name("systemMetadata")
+                    .name(NAME_SYSTEM_METADATA)
                     .description("Include systemMetadata with response.")
                     .schema(new Schema().type(TYPE_BOOLEAN)._default(false))))
         .requestBody(
@@ -575,12 +576,19 @@ private static Schema buildAspectRefResponseSchema(final String aspectName) {
             .required(List.of(PROPERTY_VALUE))
             .addProperty(PROPERTY_VALUE, new Schema<>().$ref(PATH_DEFINITIONS + aspectName));
     result.addProperty(
-        "systemMetadata",
+        NAME_SYSTEM_METADATA,
         new Schema<>()
             .type(TYPE_OBJECT)
             .anyOf(List.of(new Schema().$ref(PATH_DEFINITIONS + "SystemMetadata")))
             .description("System metadata for the aspect.")
             .nullable(true));
+    result.addProperty(
+        NAME_AUDIT_STAMP,
+        new Schema<>()
+            .type(TYPE_OBJECT)
+            .anyOf(List.of(new Schema().$ref(PATH_DEFINITIONS + "AuditStamp")))
+            .description("Audit stamp for the aspect.")
+            .nullable(true));
     return result;
   }
 
@@ -592,7 +600,7 @@ private static Schema buildAspectRefRequestSchema(final String aspectName) {
             .required(List.of(PROPERTY_VALUE))
             .addProperty(PROPERTY_VALUE, new Schema<>().$ref(PATH_DEFINITIONS + aspectName));
     result.addProperty(
-        "systemMetadata",
+        NAME_SYSTEM_METADATA,
         new Schema<>()
             .type(TYPE_OBJECT)
             .anyOf(List.of(new Schema().$ref(PATH_DEFINITIONS + "SystemMetadata")))
@@ -867,7 +875,7 @@ private static PathItem buildSingleEntityAspectPath(
             List.of(
                 new Parameter()
                     .in(NAME_QUERY)
-                    .name("systemMetadata")
+                    .name(NAME_SYSTEM_METADATA)
                     .description("Include systemMetadata with response.")
                     .schema(new Schema().type(TYPE_BOOLEAN)._default(false))))
         .summary(String.format("Patch aspect %s on %s ", aspect, upperFirstEntity))
diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/EntityController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/EntityController.java
index 9ca34934e4c65..a0478c9af1609 100644
--- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/EntityController.java
+++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/EntityController.java
@@ -13,7 +13,6 @@
 import com.fasterxml.jackson.databind.node.ObjectNode;
 import com.linkedin.common.urn.Urn;
 import com.linkedin.data.ByteString;
-import com.linkedin.data.template.RecordTemplate;
 import com.linkedin.entity.EnvelopedAspect;
 import com.linkedin.metadata.aspect.batch.AspectsBatch;
 import com.linkedin.metadata.aspect.batch.BatchItem;
@@ -28,12 +27,12 @@
 import com.linkedin.metadata.utils.AuditStampUtils;
 import com.linkedin.metadata.utils.GenericRecordUtils;
 import com.linkedin.mxe.SystemMetadata;
-import com.linkedin.util.Pair;
 import io.datahubproject.metadata.context.OperationContext;
 import io.datahubproject.metadata.context.RequestContext;
 import io.datahubproject.openapi.controller.GenericEntitiesController;
 import io.datahubproject.openapi.exception.InvalidUrnException;
 import io.datahubproject.openapi.exception.UnauthorizedException;
+import io.datahubproject.openapi.v3.models.AspectItem;
 import io.datahubproject.openapi.v3.models.GenericAspectV3;
 import io.datahubproject.openapi.v3.models.GenericEntityScrollResultV3;
 import io.datahubproject.openapi.v3.models.GenericEntityV3;
@@ -143,11 +142,27 @@ protected List<GenericEntityV3> buildEntityVersionedAspectList(
         .map(
             u ->
                 GenericEntityV3.builder()
-                    .build(objectMapper, u, toAspectMap(u, aspects.get(u), withSystemMetadata)))
+                    .build(
+                        objectMapper, u, toAspectItemMap(u, aspects.get(u), withSystemMetadata)))
         .collect(Collectors.toList());
     }
   }
 
+  private Map<String, AspectItem> toAspectItemMap(
+      Urn urn, List<EnvelopedAspect> aspects, boolean withSystemMetadata) {
+    return aspects.stream()
+        .map(
+            a ->
+                Map.entry(
+                    a.getName(),
+                    AspectItem.builder()
+                        .aspect(toRecordTemplate(lookupAspectSpec(urn, a.getName()), a))
+                        .systemMetadata(withSystemMetadata ? a.getSystemMetadata() : null)
+                        .auditStamp(withSystemMetadata ? a.getCreated() : null)
+                        .build()))
+        .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
+  }
+
   @Override
   protected List<GenericEntityV3> buildEntityList(
       Set<IngestResult> ingestResults, boolean withSystemMetadata) {
@@ -156,15 +171,21 @@ protected List<GenericEntityV3> buildEntityList(
     Map<Urn, List<IngestResult>> entityMap =
         ingestResults.stream().collect(Collectors.groupingBy(IngestResult::getUrn));
     for (Map.Entry<Urn, List<IngestResult>> urnAspects : entityMap.entrySet()) {
-      Map<String, Pair<RecordTemplate, SystemMetadata>> aspectsMap =
+      Map<String, AspectItem> aspectsMap =
           urnAspects.getValue().stream()
               .map(
                   ingest ->
                       Map.entry(
                           ingest.getRequest().getAspectName(),
-                          Pair.of(
-                              ingest.getRequest().getRecordTemplate(),
-                              withSystemMetadata ? ingest.getRequest().getSystemMetadata() : null)))
+                          AspectItem.builder()
+                              .aspect(ingest.getRequest().getRecordTemplate())
+                              .systemMetadata(
+                                  withSystemMetadata
+                                      ? ingest.getRequest().getSystemMetadata()
+                                      : null)
+                              .auditStamp(
+                                  withSystemMetadata ? ingest.getRequest().getAuditStamp() : null)
+                              .build()))
               .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
       responseList.add(
           GenericEntityV3.builder().build(objectMapper, urnAspects.getKey(), aspectsMap));
@@ -183,9 +204,12 @@ protected GenericEntityV3 buildGenericEntity(
         updateAspectResult.getUrn(),
         Map.of(
             aspectName,
-            Pair.of(
-                updateAspectResult.getNewValue(),
-                withSystemMetadata ? updateAspectResult.getNewSystemMetadata() : null)));
+            AspectItem.builder()
+                .aspect(updateAspectResult.getNewValue())
+                .systemMetadata(
+                    withSystemMetadata ? updateAspectResult.getNewSystemMetadata() : null)
+                .auditStamp(withSystemMetadata ? updateAspectResult.getAuditStamp() : null)
+                .build()));
   }
 
   private List toRecordTemplates(

From a4a887c866c362ecdd9ccb9e4df2591a01b90a3f Mon Sep 17 00:00:00 2001
From: AndreasHegerNuritas
 <163423418+AndreasHegerNuritas@users.noreply.github.com>
Date: Thu, 8 Aug 2024 16:38:16 +0100
Subject: [PATCH 06/12] =?UTF-8?q?fix(ingest/redshift):=20replace=20r'\n'?=
 =?UTF-8?q?=20with=20'\n'=20to=20avoid=20token=20error=20redshift=20server?=
 =?UTF-8?q?less=E2=80=A6=20(#11111)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../datahub/ingestion/source/redshift/redshift_schema.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift_schema.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift_schema.py
index 6e88a50f898a5..2e628269edbc3 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift_schema.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift_schema.py
@@ -504,7 +504,11 @@ def get_alter_table_commands(
                 yield AlterTableRow(
                     transaction_id=row[field_names.index("transaction_id")],
                     session_id=session_id,
-                    query_text=row[field_names.index("query_text")],
+                    # See https://docs.aws.amazon.com/redshift/latest/dg/r_STL_QUERYTEXT.html
+                    # for why we need to replace the \n with a newline.
+                    query_text=row[field_names.index("query_text")].replace(
+                        r"\n", "\n"
+                    ),
                     start_time=row[field_names.index("start_time")],
                 )
             rows = cursor.fetchmany()
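Per the AWS docs cited above, STL_QUERYTEXT stores newlines as the two-character
sequence backslash+n, so the raw value contains a literal r"\n" rather than a
real line break. A standalone sketch of the substitution (illustrative input,
not connector code):

    raw_query_text = r"ALTER TABLE public.foo\nADD COLUMN bar INT"
    print(raw_query_text.replace(r"\n", "\n"))
    # ALTER TABLE public.foo
    # ADD COLUMN bar INT
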
From 3d9a9541f1ff37ea80dd9d7d44fe501909269495 Mon Sep 17 00:00:00 2001
From: david-leifker <114954101+david-leifker@users.noreply.github.com>
Date: Thu, 8 Aug 2024 13:54:14 -0500
Subject: [PATCH 07/12] fix(entiy-client): handle null entityUrn case for
 restli (#11122)

---
 .../entity/client/RestliEntityClient.java | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java
index 2a3ae5d006ae0..780c6c6a007c2 100644
--- a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java
+++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java
@@ -50,6 +50,7 @@
 import com.linkedin.metadata.browse.BrowseResult;
 import com.linkedin.metadata.browse.BrowseResultV2;
 import com.linkedin.metadata.graph.LineageDirection;
+import com.linkedin.metadata.models.EntitySpec;
 import com.linkedin.metadata.query.AutoCompleteResult;
 import com.linkedin.metadata.query.LineageFlags;
 import com.linkedin.metadata.query.ListResult;
@@ -66,6 +67,7 @@
 import com.linkedin.metadata.search.LineageSearchResult;
 import com.linkedin.metadata.search.ScrollResult;
 import com.linkedin.metadata.search.SearchResult;
+import com.linkedin.metadata.utils.EntityKeyUtils;
 import com.linkedin.mxe.MetadataChangeProposal;
 import com.linkedin.mxe.MetadataChangeProposalArray;
 import com.linkedin.mxe.PlatformEvent;
@@ -1063,7 +1065,20 @@ public List<String> batchIngestProposals(
     String result =
         sendClientRequest(requestBuilder, opContext.getSessionAuthentication()).getEntity();
     return metadataChangeProposals.stream()
-        .map(proposal -> "success".equals(result) ? proposal.getEntityUrn().toString() : null)
+        .map(
+            proposal -> {
+              if ("success".equals(result)) {
+                if (proposal.getEntityUrn() != null) {
+                  return proposal.getEntityUrn().toString();
+                } else {
+                  EntitySpec entitySpec =
+                      opContext.getEntityRegistry().getEntitySpec(proposal.getEntityType());
+                  return EntityKeyUtils.getUrnFromProposal(proposal, entitySpec.getKeyAspectSpec())
+                      .toString();
+                }
+              }
+              return null;
+            })
         .collect(Collectors.toList());
   }
 
   @Override

From 840b15083a17c5347c63b0e74b079e7b5ea70a1e Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Thu, 8 Aug 2024 14:05:55 -0700
Subject: [PATCH 08/12] fix(sql-parser): prevent bad urns from alter table
 lineage (#11092)

---
 .../goldens/v2_sqlite_operator.json           | 112 +++++++++---------
 .../v2_sqlite_operator_no_dag_listener.json   |  64 +++++-----
 .../datahub/sql_parsing/sqlglot_lineage.py    |  62 ++++++----
 .../testing/check_sql_parser_result.py        |   1 -
 .../test_bigquery_alter_table_column.json     |  14 +++
 .../goldens/test_snowflake_drop_schema.json   |  12 ++
 .../goldens/test_sqlite_drop_table.json       |  14 +++
 .../goldens/test_sqlite_drop_view.json        |  14 +++
 .../unit/sql_parsing/test_sqlglot_lineage.py  |  51 ++++++++
 9 files changed, 231 insertions(+), 113 deletions(-)
 create mode 100644 metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_alter_table_column.json
 create mode 100644 metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_drop_schema.json
 create mode 100644 metadata-ingestion/tests/unit/sql_parsing/goldens/test_sqlite_drop_table.json
 create mode 100644 metadata-ingestion/tests/unit/sql_parsing/goldens/test_sqlite_drop_view.json

diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator.json
index e7902d165051b..4bc34b7b0d3ce 100644
--- a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator.json
+++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator.json
@@ -350,8 +350,8 @@
     "json": {
       "timestampMillis": 1717179743558,
       "partitionSpec": {
-        "type": "FULL_TABLE",
-        "partition": "FULL_TABLE_SNAPSHOT"
+        "partition": "FULL_TABLE_SNAPSHOT",
+        "type": "FULL_TABLE"
       },
       "status": "STARTED",
      "attempt": 1
@@ -367,8 +367,8 @@
     "json": {
      "timestampMillis": 1717179743932,
      "partitionSpec": {
-        "type": "FULL_TABLE",
-        "partition": "FULL_TABLE_SNAPSHOT"
+        "partition": "FULL_TABLE_SNAPSHOT",
+        "type": "FULL_TABLE"
      },
      "actor": "urn:li:corpuser:airflow",
      "operationType": "CREATE",
@@ -552,8 +552,8 @@
    "json": {
      "timestampMillis": 1717179743960,
      "partitionSpec": {
-        "type": "FULL_TABLE",
-        "partition": "FULL_TABLE_SNAPSHOT"
+        "partition": "FULL_TABLE_SNAPSHOT",
+        "type": "FULL_TABLE"
      },
      "status": "COMPLETE",
      "result": {
@@ -742,8 +742,8 @@
    "json": {
      "timestampMillis": 1717179748679,
      "partitionSpec": {
-        "type": "FULL_TABLE",
-        "partition": "FULL_TABLE_SNAPSHOT"
+        "partition": "FULL_TABLE_SNAPSHOT",
+        "type": "FULL_TABLE"
      },
      "status": "STARTED",
      "attempt": 1
@@ -759,8 +759,8 @@
    "json": {
      "timestampMillis": 1717179749258,
      "partitionSpec": {
-        "type": "FULL_TABLE",
-        "partition": "FULL_TABLE_SNAPSHOT"
+        "partition": "FULL_TABLE_SNAPSHOT",
+        "type": "FULL_TABLE"
      },
      "actor": "urn:li:corpuser:airflow",
      "operationType": "CREATE",
@@ -875,8 +875,8 @@
    "json": {
      "timestampMillis": 1717179749324,
      "partitionSpec": {
-        "type": "FULL_TABLE",
-        "partition": "FULL_TABLE_SNAPSHOT"
+        "partition": "FULL_TABLE_SNAPSHOT",
+        "type": "FULL_TABLE"
      },
      "status": "COMPLETE",
      "result": {
@@ -1161,8 +1161,8 @@
    "json": {
      "timestampMillis": 1717179757397,
      "partitionSpec": {
-        "type": "FULL_TABLE",
-        "partition": "FULL_TABLE_SNAPSHOT"
+        "partition": "FULL_TABLE_SNAPSHOT",
+        "type": "FULL_TABLE"
      },
      "status": "STARTED",
      "attempt": 1
@@ -1178,8 +1178,8 @@
    "json": {
      "timestampMillis": 1717179758424,
      "partitionSpec": {
-        "type": "FULL_TABLE",
-        "partition": "FULL_TABLE_SNAPSHOT"
+        "partition": "FULL_TABLE_SNAPSHOT",
+        "type": "FULL_TABLE"
      },
      "actor": "urn:li:corpuser:airflow",
      "operationType": "CREATE",
@@ -1420,8 +1420,8 @@
    "json": {
      "timestampMillis": 1717179758496,
      "partitionSpec": {
-        "type": "FULL_TABLE",
-        "partition": "FULL_TABLE_SNAPSHOT"
+        "partition": "FULL_TABLE_SNAPSHOT",
+        "type": "FULL_TABLE"
      },
      "status": "COMPLETE",
      "result": {
@@ -1483,10 +1483,10 @@
  "aspectName": "dataJobInputOutput",
  "aspect": {
    "json": {
-      "inputDatasets": [
+      "inputDatasets": [],
+      "outputDatasets": [
        "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)"
      ],
-      "outputDatasets": [],
      "inputDatajobs": [
        "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)"
      ],
@@ -1555,6 +1555,19 @@
    }
  }
},
+{
+  "entityType": "dataProcessInstance",
+  "entityUrn": "urn:li:dataProcessInstance:07285de22276959612189d51336cc21a",
+  "changeType": "UPSERT",
+  "aspectName": "dataProcessInstanceOutput",
+  "aspect": {
+    "json": {
+      "outputs": [
+        "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)"
+      ]
+    }
+  }
+},
{
  "entityType": "dataset",
  "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)",
@@ -1640,19 +1653,6 @@
    }
  }
},
-{
-  "entityType": "dataProcessInstance",
-  "entityUrn": "urn:li:dataProcessInstance:07285de22276959612189d51336cc21a",
-  "changeType": "UPSERT",
-  "aspectName": "dataProcessInstanceInput",
-  "aspect": {
-    "json": {
-      "inputs": [
-        "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)"
-      ]
-    }
-  }
-},
{
  "entityType": "dataProcessInstance",
  "entityUrn": "urn:li:dataProcessInstance:07285de22276959612189d51336cc21a",
@@ -1662,8 +1662,8 @@
    "json": {
      "timestampMillis": 1718733767964,
      "partitionSpec": {
-        "type": "FULL_TABLE",
-        "partition": "FULL_TABLE_SNAPSHOT"
+        "partition": "FULL_TABLE_SNAPSHOT",
+        "type": "FULL_TABLE"
      },
      "status": "STARTED",
      "attempt": 1
@@ -1679,8 +1679,8 @@
    "json": {
      "timestampMillis": 1718733768638,
      "partitionSpec": {
-        "type": "FULL_TABLE",
-        "partition": "FULL_TABLE_SNAPSHOT"
+        "partition": "FULL_TABLE_SNAPSHOT",
+        "type": "FULL_TABLE"
      },
      "status": "COMPLETE",
      "result": {
@@ -1697,10 +1697,10 @@
  "aspectName": "dataJobInputOutput",
  "aspect": {
    "json": {
-      "inputDatasets": [
+      "inputDatasets": [],
+      "outputDatasets": [
        "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)"
      ],
-      "outputDatasets": [],
      "inputDatajobs": [
        "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)"
      ],
@@ -1809,19 +1809,6 @@
    }
  }
},
-{
-  "entityType": "dataProcessInstance",
-  "entityUrn": "urn:li:dataProcessInstance:bab908abccf3cd6607b50fdaf3003372",
-  "changeType": "UPSERT",
-  "aspectName": "dataProcessInstanceInput",
-  "aspect": {
-    "json": {
-      "inputs": [
-        "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)"
-      ]
-    }
-  }
-},
{
  "entityType": "dataProcessInstance",
  "entityUrn": "urn:li:dataProcessInstance:bab908abccf3cd6607b50fdaf3003372",
@@ -1843,8 +1830,8 @@
    "json": {
      "timestampMillis": 1718733773354,
      "partitionSpec": {
-        "type": "FULL_TABLE",
-        "partition": "FULL_TABLE_SNAPSHOT"
+        "partition": "FULL_TABLE_SNAPSHOT",
+        "type": "FULL_TABLE"
      },
      "status": "STARTED",
      "attempt": 1
@@ -1860,8 +1847,8 @@
    "json": {
      "timestampMillis": 1718733774147,
      "partitionSpec": {
-        "type": "FULL_TABLE",
-        "partition": "FULL_TABLE_SNAPSHOT"
+        "partition": "FULL_TABLE_SNAPSHOT",
+        "type": "FULL_TABLE"
      },
      "status": "COMPLETE",
      "result": {
@@ -1870,5 +1857,18 @@
      }
    }
  }
+},
+{
+  "entityType": "dataProcessInstance",
+  "entityUrn": "urn:li:dataProcessInstance:bab908abccf3cd6607b50fdaf3003372",
+  "changeType": "UPSERT",
+  "aspectName": "dataProcessInstanceOutput",
+  "aspect": {
+    "json": {
+      "outputs": [
+        "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)"
+      ]
+    }
+  }
}
]
\ No newline at end of file
diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator_no_dag_listener.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator_no_dag_listener.json
index a9af068e2e4e9..99bda0e0f2569 100644
--- a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator_no_dag_listener.json
+++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator_no_dag_listener.json
@@ -336,8 +336,8 @@
    "json": {
      "timestampMillis": 1717180072004,
      "partitionSpec": {
-        "type": "FULL_TABLE",
-        "partition": "FULL_TABLE_SNAPSHOT"
+        "partition": "FULL_TABLE_SNAPSHOT",
+        "type": "FULL_TABLE"
      },
      "status": "STARTED",
      "attempt": 1
@@ -382,8 +382,8 @@
    "json": {
      "timestampMillis": 1719864194882,
      "partitionSpec": {
-        "type": "FULL_TABLE",
-        "partition": "FULL_TABLE_SNAPSHOT"
+        "partition": "FULL_TABLE_SNAPSHOT",
+        "type": "FULL_TABLE"
      },
      "actor": "urn:li:corpuser:airflow",
      "operationType": "CREATE",
@@ -435,8 +435,8 @@
    "json": {
      "timestampMillis": 1717180072275,
      "partitionSpec": {
-        "type": "FULL_TABLE",
-        "partition": "FULL_TABLE_SNAPSHOT"
+        "partition": "FULL_TABLE_SNAPSHOT",
+        "type": "FULL_TABLE"
      },
      "status": "COMPLETE",
      "result": {
@@ -641,8 +641,8 @@
    "json": {
      "timestampMillis": 1717180078196,
      "partitionSpec": {
-        "type": "FULL_TABLE",
-        "partition": "FULL_TABLE_SNAPSHOT"
+        "partition": "FULL_TABLE_SNAPSHOT",
+        "type": "FULL_TABLE"
      },
      "status": "STARTED",
      "attempt": 1
@@ -722,8 +722,8 @@
    "json": {
      "timestampMillis": 1717180078619,
      "partitionSpec": {
-        "type": "FULL_TABLE",
-        "partition": "FULL_TABLE_SNAPSHOT"
+        "partition": "FULL_TABLE_SNAPSHOT",
+        "type": "FULL_TABLE"
      },
      "status": "COMPLETE",
      "result": {
@@ -1000,8 +1000,8 @@
    "json": {
      "timestampMillis": 1717180084642,
      "partitionSpec": {
-        "type": "FULL_TABLE",
-        "partition": "FULL_TABLE_SNAPSHOT"
+        "partition": "FULL_TABLE_SNAPSHOT",
+        "type": "FULL_TABLE"
      },
      "status": "STARTED",
      "attempt": 1
@@ -1081,8 +1081,8 @@
    "json": {
      "timestampMillis": 1717180085266,
      "partitionSpec": {
-        "type": "FULL_TABLE",
-        "partition": "FULL_TABLE_SNAPSHOT"
+        "partition": "FULL_TABLE_SNAPSHOT",
+        "type": "FULL_TABLE"
      },
      "status": "COMPLETE",
      "result": {
@@ -1186,10 +1186,10 @@
  "aspectName": "dataJobInputOutput",
  "aspect": {
    "json": {
-      "inputDatasets": [
+      "inputDatasets": [],
+      "outputDatasets": [
        "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)"
      ],
-      "outputDatasets": [],
      "inputDatajobs": [
        "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)"
      ],
@@ -1287,8 +1287,8 @@
    "json": {
      "timestampMillis": 1717180091148,
      "partitionSpec": {
-        "type": "FULL_TABLE",
-        "partition": "FULL_TABLE_SNAPSHOT"
+        "partition": "FULL_TABLE_SNAPSHOT",
+        "type": "FULL_TABLE"
      },
      "status": "STARTED",
      "attempt": 1
@@ -1368,8 +1368,8 @@
    "json": {
      "timestampMillis": 1717180091923,
      "partitionSpec": {
-        "type": "FULL_TABLE",
-        "partition": "FULL_TABLE_SNAPSHOT"
+        "partition": "FULL_TABLE_SNAPSHOT",
+        "type": "FULL_TABLE"
      },
      "status": "COMPLETE",
      "result": {
@@ -1499,10 +1499,10 @@
  "aspectName": "dataJobInputOutput",
  "aspect": {
    "json": {
-      "inputDatasets": [
+      "inputDatasets": [],
+      "outputDatasets": [
        "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)"
      ],
-      "outputDatasets": [],
      "inputDatajobs": [
        "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)"
      ],
@@ -1613,8 +1613,8 @@
    "json": {
      "timestampMillis": 1717180096108,
      "partitionSpec": {
-        "type": "FULL_TABLE",
-        "partition": "FULL_TABLE_SNAPSHOT"
+        "partition": "FULL_TABLE_SNAPSHOT",
+        "type": "FULL_TABLE"
      },
      "status": "STARTED",
      "attempt": 1
@@ -1630,8 +1630,8 @@
    "json": {
      "timestampMillis": 1719864203487,
      "partitionSpec": {
-        "type": "FULL_TABLE",
-        "partition": "FULL_TABLE_SNAPSHOT"
+        "partition": "FULL_TABLE_SNAPSHOT",
+        "type": "FULL_TABLE"
      },
      "actor": "urn:li:corpuser:airflow",
      "operationType": "CREATE",
@@ -1712,8 +1712,8 @@
    "json": {
      "timestampMillis": 1717180096993,
      "partitionSpec": {
-        "type": "FULL_TABLE",
-        "partition": "FULL_TABLE_SNAPSHOT"
+        "partition": "FULL_TABLE_SNAPSHOT",
+        "type": "FULL_TABLE"
      },
      "status": "COMPLETE",
      "result": {
@@ -1727,10 +1727,10 @@
  "entityType": "dataProcessInstance",
  "entityUrn": "urn:li:dataProcessInstance:bab908abccf3cd6607b50fdaf3003372",
  "changeType": "UPSERT",
-  "aspectName": "dataProcessInstanceInput",
+  "aspectName": "dataProcessInstanceOutput",
  "aspect": {
    "json": {
-      "inputs": [
+      "outputs": [
        "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)"
      ]
    }
@@ -1740,10 +1740,10 @@
  "entityType": "dataProcessInstance",
  "entityUrn": "urn:li:dataProcessInstance:07285de22276959612189d51336cc21a",
  "changeType": "UPSERT",
-  "aspectName": "dataProcessInstanceInput",
+  "aspectName": "dataProcessInstanceOutput",
  "aspect": {
    "json": {
-      "inputs": [
+      "outputs": [
        "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)"
      ]
    }
diff --git a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py
index 976ff8bcc9b3f..0146343002171 100644
--- a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py
+++ b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py
@@ -189,35 +189,49 @@ def _table_level_lineage(
     statement: sqlglot.Expression, dialect: sqlglot.Dialect
 ) -> Tuple[Set[_TableName], Set[_TableName]]:
     # Generate table-level lineage.
-    modified = {
-        _TableName.from_sqlglot_table(expr.this)
-        for expr in statement.find_all(
-            sqlglot.exp.Create,
-            sqlglot.exp.Insert,
-            sqlglot.exp.Update,
-            sqlglot.exp.Delete,
-            sqlglot.exp.Merge,
-        )
-        # In some cases like "MERGE ... then INSERT (col1, col2) VALUES (col1, col2)",
-        # the `this` on the INSERT part isn't a table.
-        if isinstance(expr.this, sqlglot.exp.Table)
-    } | {
-        # For statements that include a column list, like
-        # CREATE DDL statements and `INSERT INTO table (col1, col2) SELECT ...`
-        # the table name is nested inside a Schema object.
-        _TableName.from_sqlglot_table(expr.this.this)
-        for expr in statement.find_all(
-            sqlglot.exp.Create,
-            sqlglot.exp.Insert,
-        )
-        if isinstance(expr.this, sqlglot.exp.Schema)
-        and isinstance(expr.this.this, sqlglot.exp.Table)
-    }
+    modified = (
+        {
+            _TableName.from_sqlglot_table(expr.this)
+            for expr in statement.find_all(
+                sqlglot.exp.Create,
+                sqlglot.exp.Insert,
+                sqlglot.exp.Update,
+                sqlglot.exp.Delete,
+                sqlglot.exp.Merge,
+                sqlglot.exp.AlterTable,
+            )
+            # In some cases like "MERGE ... then INSERT (col1, col2) VALUES (col1, col2)",
+            # the `this` on the INSERT part isn't a table.
+            if isinstance(expr.this, sqlglot.exp.Table)
+        }
+        | {
+            # For statements that include a column list, like
+            # CREATE DDL statements and `INSERT INTO table (col1, col2) SELECT ...`
+            # the table name is nested inside a Schema object.
+            _TableName.from_sqlglot_table(expr.this.this)
+            for expr in statement.find_all(
+                sqlglot.exp.Create,
+                sqlglot.exp.Insert,
+            )
+            if isinstance(expr.this, sqlglot.exp.Schema)
+            and isinstance(expr.this.this, sqlglot.exp.Table)
+        }
+        | {
+            # For drop statements, we only want it if a table/view is being dropped.
+            # Other "kinds" will not have table.name populated.
+            _TableName.from_sqlglot_table(expr.this)
+            for expr in ([statement] if isinstance(statement, sqlglot.exp.Drop) else [])
+            if isinstance(expr.this, sqlglot.exp.Table)
+            and expr.this.this
+            and expr.this.name
+        }
+    )
 
     tables = (
         {
             _TableName.from_sqlglot_table(table)
             for table in statement.find_all(sqlglot.exp.Table)
+            if not isinstance(table.parent, sqlglot.exp.Drop)
         }
         # ignore references created in this query
         - modified
diff --git a/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py b/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py
index 39c0dddd31400..72b5f6c5e26e4 100644
--- a/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py
+++ b/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py
@@ -15,7 +15,6 @@
 
 logger = logging.getLogger(__name__)
 
-# TODO: Hook this into the standard --update-golden-files mechanism.
UPDATE_FILES = os.environ.get("UPDATE_SQLPARSER_FILES", "false").lower() == "true" diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_alter_table_column.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_alter_table_column.json new file mode 100644 index 0000000000000..3c6c9737e8e19 --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_alter_table_column.json @@ -0,0 +1,14 @@ +{ + "query_type": "UNKNOWN", + "query_type_props": {}, + "query_fingerprint": "7d04253c3add0194c557942ef9b7485f38e68762d300dad364b9cec8656035b3", + "in_tables": [], + "out_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:bigquery,my-bq-project.covid_data.covid_deaths,PROD)" + ], + "column_lineage": null, + "debug_info": { + "confidence": 0.2, + "generalized_statement": "ALTER TABLE `my-bq-project.covid_data.covid_deaths` DROP COLUMN patient_name" + } +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_drop_schema.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_drop_schema.json new file mode 100644 index 0000000000000..2784b8e9543b2 --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_drop_schema.json @@ -0,0 +1,12 @@ +{ + "query_type": "UNKNOWN", + "query_type_props": {}, + "query_fingerprint": "4eefab57619a812a94030acce0071857561265945e79d798563adb53bd0b9646", + "in_tables": [], + "out_tables": [], + "column_lineage": null, + "debug_info": { + "confidence": 0.9, + "generalized_statement": "DROP SCHEMA my_schema" + } +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_sqlite_drop_table.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_sqlite_drop_table.json new file mode 100644 index 0000000000000..ae8b3f99897dc --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_sqlite_drop_table.json @@ -0,0 +1,14 @@ +{ + "query_type": "UNKNOWN", + "query_type_props": {}, + "query_fingerprint": "d1c29ad73325b08bb66e62ec00ba1d5be4412ec72b4bbc9c094f1272b9da4f86", + "in_tables": [], + "out_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,my_schema.my_table,PROD)" + ], + "column_lineage": null, + "debug_info": { + "confidence": 0.2, + "generalized_statement": "DROP TABLE my_schema.my_table" + } +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_sqlite_drop_view.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_sqlite_drop_view.json new file mode 100644 index 0000000000000..6650ef396a570 --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_sqlite_drop_view.json @@ -0,0 +1,14 @@ +{ + "query_type": "UNKNOWN", + "query_type_props": {}, + "query_fingerprint": "35a3c60e7ed98884dde3f1f5fe9079f844832430589a3326b97d617b8303f191", + "in_tables": [], + "out_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,my_schema.my_view,PROD)" + ], + "column_lineage": null, + "debug_info": { + "confidence": 0.2, + "generalized_statement": "DROP VIEW my_schema.my_view" + } +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py index e5b669329f16c..3096c9b8269a1 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py @@ -2,11 +2,22 @@ import pytest +import 
datahub.testing.check_sql_parser_result as checker from datahub.testing.check_sql_parser_result import assert_sql_result RESOURCE_DIR = pathlib.Path(__file__).parent / "goldens" +@pytest.fixture(autouse=True) +def set_update_sql_parser( + pytestconfig: pytest.Config, monkeypatch: pytest.MonkeyPatch +) -> None: + update_golden = pytestconfig.getoption("--update-golden-files") + + if update_golden: + monkeypatch.setattr(checker, "UPDATE_FILES", True) + + def test_invalid_sql(): assert_sql_result( """ @@ -1202,3 +1213,43 @@ def test_bigquery_information_schema_query() -> None: dialect="bigquery", expected_file=RESOURCE_DIR / "test_bigquery_information_schema_query.json", ) + + +def test_bigquery_alter_table_column() -> None: + assert_sql_result( + """\ +ALTER TABLE `my-bq-project.covid_data.covid_deaths` drop COLUMN patient_name + """, + dialect="bigquery", + expected_file=RESOURCE_DIR / "test_bigquery_alter_table_column.json", + ) + + +def test_sqlite_drop_table() -> None: + assert_sql_result( + """\ +DROP TABLE my_schema.my_table +""", + dialect="sqlite", + expected_file=RESOURCE_DIR / "test_sqlite_drop_table.json", + ) + + +def test_sqlite_drop_view() -> None: + assert_sql_result( + """\ +DROP VIEW my_schema.my_view +""", + dialect="sqlite", + expected_file=RESOURCE_DIR / "test_sqlite_drop_view.json", + ) + + +def test_snowflake_drop_schema() -> None: + assert_sql_result( + """\ +DROP SCHEMA my_schema +""", + dialect="snowflake", + expected_file=RESOURCE_DIR / "test_snowflake_drop_schema.json", + ) From 78336c9f58fb89a25f4228b2e8b5c0322d66807f Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Fri, 9 Aug 2024 09:18:51 +0530 Subject: [PATCH 09/12] fix(ingest/bigquery): use small batch size if use_tables_list_query_v2 is set (#11121) --- .../datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py index 46ec75edb9734..c6a50a1c977f4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py @@ -985,7 +985,7 @@ def get_tables_for_dataset( # https://cloud.google.com/bigquery/docs/information-schema-partitions max_batch_size: int = ( self.config.number_of_datasets_process_in_batch - if not self.config.is_profiling_enabled() + if not self.config.have_table_data_read_permission else self.config.number_of_datasets_process_in_batch_if_profiling_enabled ) From aa07e2a9371e8a5c90c87217dfc136dda38f60f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20L=C3=BCdin?= <13187726+Masterchen09@users.noreply.github.com> Date: Fri, 9 Aug 2024 17:59:38 +0200 Subject: [PATCH 10/12] fix(graphql): add missing entities to EntityTypeMapper and EntityTypeUrnMapper (#10366) --- .../types/entitytype/EntityTypeMapper.java | 33 ++++++++------ .../types/entitytype/EntityTypeUrnMapper.java | 43 ++++++++++++++++--- 2 files changed, 57 insertions(+), 19 deletions(-) diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/entitytype/EntityTypeMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/entitytype/EntityTypeMapper.java index 26835f9e57dcd..77457a814bd67 100644 --- 
a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/entitytype/EntityTypeMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/entitytype/EntityTypeMapper.java @@ -15,40 +15,49 @@ public class EntityTypeMapper { static final Map ENTITY_TYPE_TO_NAME = ImmutableMap.builder() + .put(EntityType.DOMAIN, Constants.DOMAIN_ENTITY_NAME) .put(EntityType.DATASET, Constants.DATASET_ENTITY_NAME) - .put(EntityType.ROLE, Constants.ROLE_ENTITY_NAME) .put(EntityType.CORP_USER, Constants.CORP_USER_ENTITY_NAME) .put(EntityType.CORP_GROUP, Constants.CORP_GROUP_ENTITY_NAME) .put(EntityType.DATA_PLATFORM, Constants.DATA_PLATFORM_ENTITY_NAME) + .put(EntityType.ER_MODEL_RELATIONSHIP, Constants.ER_MODEL_RELATIONSHIP_ENTITY_NAME) .put(EntityType.DASHBOARD, Constants.DASHBOARD_ENTITY_NAME) + .put(EntityType.NOTEBOOK, Constants.NOTEBOOK_ENTITY_NAME) .put(EntityType.CHART, Constants.CHART_ENTITY_NAME) - .put(EntityType.TAG, Constants.TAG_ENTITY_NAME) .put(EntityType.DATA_FLOW, Constants.DATA_FLOW_ENTITY_NAME) .put(EntityType.DATA_JOB, Constants.DATA_JOB_ENTITY_NAME) - .put(EntityType.DATA_PROCESS_INSTANCE, Constants.DATA_PROCESS_INSTANCE_ENTITY_NAME) + .put(EntityType.TAG, Constants.TAG_ENTITY_NAME) .put(EntityType.GLOSSARY_TERM, Constants.GLOSSARY_TERM_ENTITY_NAME) .put(EntityType.GLOSSARY_NODE, Constants.GLOSSARY_NODE_ENTITY_NAME) + .put(EntityType.CONTAINER, Constants.CONTAINER_ENTITY_NAME) .put(EntityType.MLMODEL, Constants.ML_MODEL_ENTITY_NAME) .put(EntityType.MLMODEL_GROUP, Constants.ML_MODEL_GROUP_ENTITY_NAME) .put(EntityType.MLFEATURE_TABLE, Constants.ML_FEATURE_TABLE_ENTITY_NAME) .put(EntityType.MLFEATURE, Constants.ML_FEATURE_ENTITY_NAME) .put(EntityType.MLPRIMARY_KEY, Constants.ML_PRIMARY_KEY_ENTITY_NAME) - .put(EntityType.CONTAINER, Constants.CONTAINER_ENTITY_NAME) - .put(EntityType.DOMAIN, Constants.DOMAIN_ENTITY_NAME) - .put(EntityType.NOTEBOOK, Constants.NOTEBOOK_ENTITY_NAME) + .put(EntityType.INGESTION_SOURCE, Constants.INGESTION_SOURCE_ENTITY_NAME) + .put(EntityType.EXECUTION_REQUEST, Constants.EXECUTION_REQUEST_ENTITY_NAME) + .put(EntityType.ASSERTION, Constants.ASSERTION_ENTITY_NAME) + .put(EntityType.DATA_PROCESS_INSTANCE, Constants.DATA_PROCESS_INSTANCE_ENTITY_NAME) .put(EntityType.DATA_PLATFORM_INSTANCE, Constants.DATA_PLATFORM_INSTANCE_ENTITY_NAME) + .put(EntityType.ACCESS_TOKEN, Constants.ACCESS_TOKEN_ENTITY_NAME) .put(EntityType.TEST, Constants.TEST_ENTITY_NAME) - .put(EntityType.ER_MODEL_RELATIONSHIP, Constants.ER_MODEL_RELATIONSHIP_ENTITY_NAME) + .put(EntityType.DATAHUB_POLICY, Constants.POLICY_ENTITY_NAME) + .put(EntityType.DATAHUB_ROLE, Constants.DATAHUB_ROLE_ENTITY_NAME) + .put(EntityType.POST, Constants.POST_ENTITY_NAME) + .put(EntityType.SCHEMA_FIELD, Constants.SCHEMA_FIELD_ENTITY_NAME) .put(EntityType.DATAHUB_VIEW, Constants.DATAHUB_VIEW_ENTITY_NAME) + .put(EntityType.QUERY, Constants.QUERY_ENTITY_NAME) .put(EntityType.DATA_PRODUCT, Constants.DATA_PRODUCT_ENTITY_NAME) - .put(EntityType.SCHEMA_FIELD, Constants.SCHEMA_FIELD_ENTITY_NAME) + .put(EntityType.CUSTOM_OWNERSHIP_TYPE, Constants.OWNERSHIP_TYPE_ENTITY_NAME) + .put(EntityType.INCIDENT, Constants.INCIDENT_ENTITY_NAME) + .put(EntityType.ROLE, Constants.ROLE_ENTITY_NAME) .put(EntityType.STRUCTURED_PROPERTY, Constants.STRUCTURED_PROPERTY_ENTITY_NAME) - .put(EntityType.ASSERTION, Constants.ASSERTION_ENTITY_NAME) + .put(EntityType.FORM, Constants.FORM_ENTITY_NAME) + .put(EntityType.DATA_TYPE, Constants.DATA_TYPE_ENTITY_NAME) + .put(EntityType.ENTITY_TYPE, 
Constants.ENTITY_TYPE_ENTITY_NAME) .put(EntityType.RESTRICTED, Constants.RESTRICTED_ENTITY_NAME) .put(EntityType.BUSINESS_ATTRIBUTE, Constants.BUSINESS_ATTRIBUTE_ENTITY_NAME) - .put(EntityType.QUERY, Constants.QUERY_ENTITY_NAME) - .put(EntityType.POST, Constants.POST_ENTITY_NAME) - .put(EntityType.FORM, Constants.FORM_ENTITY_NAME) .build(); private static final Map ENTITY_NAME_TO_TYPE = diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/entitytype/EntityTypeUrnMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/entitytype/EntityTypeUrnMapper.java index 9e9bf86e5fe7f..334faf753cb8b 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/entitytype/EntityTypeUrnMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/entitytype/EntityTypeUrnMapper.java @@ -20,34 +20,63 @@ public class EntityTypeUrnMapper { static final Map ENTITY_NAME_TO_ENTITY_TYPE_URN = ImmutableMap.builder() + .put(Constants.DOMAIN_ENTITY_NAME, "urn:li:entityType:datahub.domain") .put(Constants.DATASET_ENTITY_NAME, "urn:li:entityType:datahub.dataset") - .put(Constants.ROLE_ENTITY_NAME, "urn:li:entityType:datahub.role") .put(Constants.CORP_USER_ENTITY_NAME, "urn:li:entityType:datahub.corpuser") .put(Constants.CORP_GROUP_ENTITY_NAME, "urn:li:entityType:datahub.corpGroup") .put(Constants.DATA_PLATFORM_ENTITY_NAME, "urn:li:entityType:datahub.dataPlatform") + .put( + Constants.ER_MODEL_RELATIONSHIP_ENTITY_NAME, + "urn:li:entityType:datahub.erModelRelationship") .put(Constants.DASHBOARD_ENTITY_NAME, "urn:li:entityType:datahub.dashboard") + .put(Constants.NOTEBOOK_ENTITY_NAME, "urn:li:entityType:datahub.notebook") .put(Constants.CHART_ENTITY_NAME, "urn:li:entityType:datahub.chart") - .put(Constants.TAG_ENTITY_NAME, "urn:li:entityType:datahub.tag") .put(Constants.DATA_FLOW_ENTITY_NAME, "urn:li:entityType:datahub.dataFlow") .put(Constants.DATA_JOB_ENTITY_NAME, "urn:li:entityType:datahub.dataJob") + .put(Constants.TAG_ENTITY_NAME, "urn:li:entityType:datahub.tag") .put(Constants.GLOSSARY_TERM_ENTITY_NAME, "urn:li:entityType:datahub.glossaryTerm") .put(Constants.GLOSSARY_NODE_ENTITY_NAME, "urn:li:entityType:datahub.glossaryNode") + .put(Constants.CONTAINER_ENTITY_NAME, "urn:li:entityType:datahub.container") .put(Constants.ML_MODEL_ENTITY_NAME, "urn:li:entityType:datahub.mlModel") .put(Constants.ML_MODEL_GROUP_ENTITY_NAME, "urn:li:entityType:datahub.mlModelGroup") .put(Constants.ML_FEATURE_TABLE_ENTITY_NAME, "urn:li:entityType:datahub.mlFeatureTable") .put(Constants.ML_FEATURE_ENTITY_NAME, "urn:li:entityType:datahub.mlFeature") .put(Constants.ML_PRIMARY_KEY_ENTITY_NAME, "urn:li:entityType:datahub.mlPrimaryKey") - .put(Constants.CONTAINER_ENTITY_NAME, "urn:li:entityType:datahub.container") - .put(Constants.DOMAIN_ENTITY_NAME, "urn:li:entityType:datahub.domain") - .put(Constants.NOTEBOOK_ENTITY_NAME, "urn:li:entityType:datahub.notebook") + .put( + Constants.INGESTION_SOURCE_ENTITY_NAME, + "urn:li:entityType:datahub.dataHubIngestionSource") + .put( + Constants.EXECUTION_REQUEST_ENTITY_NAME, + "urn:li:entityType:datahub.dataHubExecutionRequest") + .put(Constants.ASSERTION_ENTITY_NAME, "urn:li:entityType:datahub.assertion") + .put( + Constants.DATA_PROCESS_INSTANCE_ENTITY_NAME, + "urn:li:entityType:datahub.dataProcessInstance") .put( Constants.DATA_PLATFORM_INSTANCE_ENTITY_NAME, "urn:li:entityType:datahub.dataPlatformInstance") + .put(Constants.ACCESS_TOKEN_ENTITY_NAME, "urn:li:entityType:datahub.dataHubAccessToken") 
.put(Constants.TEST_ENTITY_NAME, "urn:li:entityType:datahub.test") + .put(Constants.POLICY_ENTITY_NAME, "urn:li:entityType:datahub.dataHubPolicy") + .put(Constants.DATAHUB_ROLE_ENTITY_NAME, "urn:li:entityType:datahub.dataHubRole") + .put(Constants.POST_ENTITY_NAME, "urn:li:entityType:datahub.post") + .put(Constants.SCHEMA_FIELD_ENTITY_NAME, "urn:li:entityType:datahub.schemaField") .put(Constants.DATAHUB_VIEW_ENTITY_NAME, "urn:li:entityType:datahub.dataHubView") + .put(Constants.QUERY_ENTITY_NAME, "urn:li:entityType:datahub.query") .put(Constants.DATA_PRODUCT_ENTITY_NAME, "urn:li:entityType:datahub.dataProduct") - .put(Constants.ASSERTION_ENTITY_NAME, "urn:li:entityType:datahub.assertion") - .put(Constants.SCHEMA_FIELD_ENTITY_NAME, "urn:li:entityType:datahub.schemaField") + .put(Constants.OWNERSHIP_TYPE_ENTITY_NAME, "urn:li:entityType:datahub.ownershipType") + .put(Constants.INCIDENT_ENTITY_NAME, "urn:li:entityType:datahub.incident") + .put(Constants.ROLE_ENTITY_NAME, "urn:li:entityType:datahub.role") + .put( + Constants.STRUCTURED_PROPERTY_ENTITY_NAME, + "urn:li:entityType:datahub.structuredProperty") + .put(Constants.FORM_ENTITY_NAME, "urn:li:entityType:datahub.form") + .put(Constants.DATA_TYPE_ENTITY_NAME, "urn:li:entityType:datahub.dataType") + .put(Constants.ENTITY_TYPE_ENTITY_NAME, "urn:li:entityType:datahub.entityType") + .put(Constants.RESTRICTED_ENTITY_NAME, "urn:li:entityType:datahub.restricted") + .put( + Constants.BUSINESS_ATTRIBUTE_ENTITY_NAME, + "urn:li:entityType:datahub.businessAttribute") .build(); private static final Map ENTITY_TYPE_URN_TO_NAME = From 3a38415d6b1497f439cb8fffa2b69032e35cf04a Mon Sep 17 00:00:00 2001 From: jayasimhankv <145704974+jayasimhankv@users.noreply.github.com> Date: Fri, 9 Aug 2024 11:02:17 -0500 Subject: [PATCH 11/12] feat(ui): Changes to allow editable dataset name (#10608) Co-authored-by: Jay Kadambi --- .../graphql/featureflags/FeatureFlags.java | 1 + .../resolvers/config/AppConfigResolver.java | 1 + .../resolvers/mutate/UpdateNameResolver.java | 36 +++++++++++++++++++ .../types/dataset/mappers/DatasetMapper.java | 4 +++ .../mappers/DatasetUpdateInputMapper.java | 9 +++-- .../src/main/resources/app.graphql | 5 +++ .../src/main/resources/entity.graphql | 9 +++++ .../src/app/entity/dataset/DatasetEntity.tsx | 7 ++-- .../profile/header/EntityHeader.tsx | 10 ++++-- datahub-web-react/src/app/useAppConfig.ts | 5 +++ datahub-web-react/src/appConfigContext.tsx | 1 + datahub-web-react/src/graphql/app.graphql | 1 + datahub-web-react/src/graphql/browse.graphql | 1 + .../src/graphql/fragments.graphql | 1 + datahub-web-react/src/graphql/preview.graphql | 1 + datahub-web-react/src/graphql/search.graphql | 1 + .../dataset/EditableDatasetProperties.pdl | 9 +++++ ...com.linkedin.entity.entities.snapshot.json | 9 +++++ ...m.linkedin.platform.platform.snapshot.json | 9 +++++ 19 files changed, 113 insertions(+), 7 deletions(-) diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/featureflags/FeatureFlags.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/featureflags/FeatureFlags.java index 85a2c09ed79a7..167515a13c4da 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/featureflags/FeatureFlags.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/featureflags/FeatureFlags.java @@ -21,5 +21,6 @@ public class FeatureFlags { private boolean schemaFieldEntityFetchEnabled = false; private boolean businessAttributeEntityEnabled = false; private boolean dataContractsEnabled = 
false; + private boolean editableDatasetNameEnabled = false; private boolean showSeparateSiblings = false; } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java index fb1672d54dc97..259d05c631557 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java @@ -186,6 +186,7 @@ public CompletableFuture get(final DataFetchingEnvironment environmen .setNestedDomainsEnabled(_featureFlags.isNestedDomainsEnabled()) .setPlatformBrowseV2(_featureFlags.isPlatformBrowseV2()) .setDataContractsEnabled(_featureFlags.isDataContractsEnabled()) + .setEditableDatasetNameEnabled(_featureFlags.isEditableDatasetNameEnabled()) .setShowSeparateSiblings(_featureFlags.isShowSeparateSiblings()) .build(); diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/UpdateNameResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/UpdateNameResolver.java index 1d90720fc6902..ad6dbbe635ed1 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/UpdateNameResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/UpdateNameResolver.java @@ -4,9 +4,11 @@ import static com.linkedin.datahub.graphql.resolvers.mutate.MutationUtils.persistAspect; import com.linkedin.businessattribute.BusinessAttributeInfo; +import com.linkedin.common.AuditStamp; import com.linkedin.common.urn.CorpuserUrn; import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; +import com.linkedin.data.template.SetMode; import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.authorization.AuthorizationUtils; import com.linkedin.datahub.graphql.concurrency.GraphQLConcurrencyUtils; @@ -20,6 +22,7 @@ import com.linkedin.datahub.graphql.resolvers.mutate.util.DomainUtils; import com.linkedin.datahub.graphql.resolvers.mutate.util.GlossaryUtils; import com.linkedin.dataproduct.DataProductProperties; +import com.linkedin.dataset.EditableDatasetProperties; import com.linkedin.domain.DomainProperties; import com.linkedin.domain.Domains; import com.linkedin.entity.client.EntityClient; @@ -70,6 +73,8 @@ public CompletableFuture get(DataFetchingEnvironment environment) throw return updateDataProductName(targetUrn, input, context); case Constants.BUSINESS_ATTRIBUTE_ENTITY_NAME: return updateBusinessAttributeName(targetUrn, input, environment.getContext()); + case Constants.DATASET_ENTITY_NAME: + return updateDatasetName(targetUrn, input, environment.getContext()); default: throw new RuntimeException( String.format( @@ -236,6 +241,37 @@ private Boolean updateGroupName(Urn targetUrn, UpdateNameInput input, QueryConte "Unauthorized to perform this action. 
Please contact your DataHub administrator."); } + // updates editable dataset properties aspect's name field + private Boolean updateDatasetName(Urn targetUrn, UpdateNameInput input, QueryContext context) { + if (AuthorizationUtils.canEditProperties(targetUrn, context)) { + try { + if (input.getName() != null) { + final EditableDatasetProperties editableDatasetProperties = + new EditableDatasetProperties(); + editableDatasetProperties.setName(input.getName()); + final AuditStamp auditStamp = new AuditStamp(); + Urn actor = UrnUtils.getUrn(context.getActorUrn()); + auditStamp.setActor(actor, SetMode.IGNORE_NULL); + auditStamp.setTime(System.currentTimeMillis()); + editableDatasetProperties.setLastModified(auditStamp); + persistAspect( + context.getOperationContext(), + targetUrn, + Constants.EDITABLE_DATASET_PROPERTIES_ASPECT_NAME, + editableDatasetProperties, + actor, + _entityService); + } + return true; + } catch (Exception e) { + throw new RuntimeException( + String.format("Failed to perform update against input %s", input), e); + } + } + throw new AuthorizationException( + "Unauthorized to perform this action. Please contact your DataHub administrator."); + } + private Boolean updateDataProductName( Urn targetUrn, UpdateNameInput input, QueryContext context) { try { diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataset/mappers/DatasetMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataset/mappers/DatasetMapper.java index 89d5aa8621bf0..a7b5f6de0c183 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataset/mappers/DatasetMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataset/mappers/DatasetMapper.java @@ -222,6 +222,7 @@ private void mapDatasetProperties( properties.setQualifiedName(gmsProperties.getQualifiedName()); dataset.setProperties(properties); dataset.setDescription(properties.getDescription()); + dataset.setName(properties.getName()); if (gmsProperties.getUri() != null) { dataset.setUri(gmsProperties.getUri().toString()); } @@ -248,6 +249,9 @@ private void mapEditableDatasetProperties(@Nonnull Dataset dataset, @Nonnull Dat new EditableDatasetProperties(dataMap); final DatasetEditableProperties editableProperties = new DatasetEditableProperties(); editableProperties.setDescription(editableDatasetProperties.getDescription()); + if (editableDatasetProperties.getName() != null) { + editableProperties.setName(editableDatasetProperties.getName()); + } dataset.setEditableProperties(editableProperties); } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataset/mappers/DatasetUpdateInputMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataset/mappers/DatasetUpdateInputMapper.java index 122298bcab654..104dc0e104341 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataset/mappers/DatasetUpdateInputMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataset/mappers/DatasetUpdateInputMapper.java @@ -111,8 +111,13 @@ public Collection apply( if (datasetUpdateInput.getEditableProperties() != null) { final EditableDatasetProperties editableDatasetProperties = new EditableDatasetProperties(); - editableDatasetProperties.setDescription( - datasetUpdateInput.getEditableProperties().getDescription()); + if (datasetUpdateInput.getEditableProperties().getDescription() != null) { +
editableDatasetProperties.setDescription( + datasetUpdateInput.getEditableProperties().getDescription()); + } + if (datasetUpdateInput.getEditableProperties().getName() != null) { + editableDatasetProperties.setName(datasetUpdateInput.getEditableProperties().getName()); + } editableDatasetProperties.setLastModified(auditStamp); editableDatasetProperties.setCreated(auditStamp); proposals.add( diff --git a/datahub-graphql-core/src/main/resources/app.graphql b/datahub-graphql-core/src/main/resources/app.graphql index 024a7a989f9db..262d2384d84ad 100644 --- a/datahub-graphql-core/src/main/resources/app.graphql +++ b/datahub-graphql-core/src/main/resources/app.graphql @@ -508,6 +508,11 @@ type FeatureFlagsConfig { """ dataContractsEnabled: Boolean! + """ + Whether dataset names are editable + """ + editableDatasetNameEnabled: Boolean! + """ If turned on, all siblings will be separated with no way to get to a "combined" sibling view """ diff --git a/datahub-graphql-core/src/main/resources/entity.graphql b/datahub-graphql-core/src/main/resources/entity.graphql index 941a6a28ceb2c..609597beee51b 100644 --- a/datahub-graphql-core/src/main/resources/entity.graphql +++ b/datahub-graphql-core/src/main/resources/entity.graphql @@ -3482,6 +3482,11 @@ type DatasetEditableProperties { Description of the Dataset """ description: String + + """ + Editable name of the Dataset + """ + name: String } """ @@ -4850,6 +4855,10 @@ input DatasetEditablePropertiesUpdate { Writable description aka documentation for a Dataset """ description: String! + """ + Editable name of the Dataset + """ + name: String } """ diff --git a/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx b/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx index c30fee7abc0b6..21ae085832cb3 100644 --- a/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx +++ b/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx @@ -220,6 +220,7 @@ export class DatasetEntity implements Entity { }, ]} sidebarSections={this.getSidebarSections()} + isNameEditable /> ); @@ -283,7 +284,7 @@ export class DatasetEntity implements Entity { return ( { return ( { }; displayName = (data: Dataset) => { - return data?.properties?.name || data.name || data.urn; + return data?.editableProperties?.name || data?.properties?.name || data.name || data.urn; }; platformLogoUrl = (data: Dataset) => { diff --git a/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHeader.tsx b/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHeader.tsx index 09fa23dbc9f57..11335d0378760 100644 --- a/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHeader.tsx +++ b/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHeader.tsx @@ -17,6 +17,7 @@ import { capitalizeFirstLetterOnly } from '../../../../../shared/textUtil'; import { useUserContext } from '../../../../../context/useUserContext'; import { useEntityRegistry } from '../../../../../useEntityRegistry'; import EntityHeaderLoadingSection from './EntityHeaderLoadingSection'; +import { useIsEditableDatasetNameEnabled } from '../../../../../useAppConfig'; const TitleWrapper = styled.div` display: flex; @@ -71,6 +72,8 @@ export function getCanEditName( return true; // TODO: add permissions for data products case EntityType.BusinessAttribute: return privileges?.manageBusinessAttributes; + case EntityType.Dataset: + return entityData?.privileges?.canEditProperties; default: return false; } @@ -94,8 +97,11 @@ export const EntityHeader = ({ 
headerDropdownItems, headerActionItems, isNameEdi const entityName = entityData?.name; const subType = capitalizeFirstLetterOnly(entityData?.subTypes?.typeNames?.[0]) || undefined; + const isEditableDatasetNameEnabled = useIsEditableDatasetNameEnabled(); const canEditName = - isNameEditable && getCanEditName(entityType, entityData, me?.platformPrivileges as PlatformPrivileges); + isEditableDatasetNameEnabled && + isNameEditable && + getCanEditName(entityType, entityData, me?.platformPrivileges as PlatformPrivileges); const entityRegistry = useEntityRegistry(); return ( @@ -106,7 +112,7 @@ export const EntityHeader = ({ headerDropdownItems, headerActionItems, isNameEdi <> - + {entityData?.deprecation?.deprecated && ( Date: Sat, 10 Aug 2024 01:18:11 +0900 Subject: [PATCH 12/12] fix: remove saxo (#11127) --- README.md | 1 - docs-website/adoptionStoriesIndexes.json | 11 ----- .../src/pages/_components/Logos/index.js | 42 +++++++++--------- .../logos/scrollingCompanies/saxo_bank.webp | Bin 3592 -> 0 bytes 4 files changed, 21 insertions(+), 33 deletions(-) delete mode 100644 docs-website/static/img/logos/scrollingCompanies/saxo_bank.webp diff --git a/README.md b/README.md index b3c2e2d545941..3ac0668918f70 100644 --- a/README.md +++ b/README.md @@ -138,7 +138,6 @@ Here are the companies that have officially adopted DataHub. Please feel free to - [Peloton](https://www.onepeloton.com) - [PITS Global Data Recovery Services](https://www.pitsdatarecovery.net/) - [Razer](https://www.razer.com) -- [Saxo Bank](https://www.home.saxo) - [Showroomprive](https://www.showroomprive.com/) - [SpotHero](https://spothero.com) - [Stash](https://www.stash.com) diff --git a/docs-website/adoptionStoriesIndexes.json b/docs-website/adoptionStoriesIndexes.json index 3fe666ccf1c13..9697bdfcf39a9 100644 --- a/docs-website/adoptionStoriesIndexes.json +++ b/docs-website/adoptionStoriesIndexes.json @@ -77,17 +77,6 @@ "category": "B2B & B2C", "description": "“We looked around for data catalog tool, and DataHub was a clear winner.”

Zynga levels up data management using DataHub, highlighting its role in enhancing data management, tracing data lineage, and ensuring data quality." }, - { - "name": "Saxo Bank", - "slug": "saxo-bank", - "imageUrl": "/img/logos/companies/saxobank.svg", - "imageSize": "default", - "link": "https://blog.datahubproject.io/enabling-data-discovery-in-a-data-mesh-the-saxo-journey-451b06969c8f", - "linkType": "blog", - "tagline": "Enabling Data Discovery in a Data Mesh", - "category": "Financial & Fintech", - "description": "Saxo Bank adopted DataHub to enhance data quality and streamline governance, facilitating efficient data management through self-service capabilities.

By integrating Apache Kafka and Snowflake with DataHub, the bank embraced Data Mesh principles to democratize data, support rapid growth, and improve business processes." - }, { "name": "MediaMarkt Saturn", "slug": "mediamarkt-saturn", diff --git a/docs-website/src/pages/_components/Logos/index.js b/docs-website/src/pages/_components/Logos/index.js index 565f6e9a46fee..b17c072d02d57 100644 --- a/docs-website/src/pages/_components/Logos/index.js +++ b/docs-website/src/pages/_components/Logos/index.js @@ -168,36 +168,36 @@ export const CompanyLogos = () => ( modules={[Pagination]} className={clsx("mySwiper", styles.companyWrapper)} > - {companies.map((company, idx) => ( - - {company.link ? ( - + {companies + .filter((company) => company.imageUrl) // Filter companies with imageUrl + .map((company, idx) => ( + + {company.link ? ( + + {company.name} + + ) : ( {company.name} - - ) : ( - {company.name} - )} - - ))} + )} + + ))} ); diff --git a/docs-website/static/img/logos/scrollingCompanies/saxo_bank.webp b/docs-website/static/img/logos/scrollingCompanies/saxo_bank.webp deleted file mode 100644 index a4c1aae73fe48b88946801f5713f15b10df3c177..0000000000000000000000000000000000000000 Binary files a/docs-website/static/img/logos/scrollingCompanies/saxo_bank.webp and /dev/null differ
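
A note on the sqlglot lineage change earlier in this series: the sketch below is a minimal, standalone illustration of the same DROP guard that _table_level_lineage now applies, not part of any patch. It assumes only that sqlglot is installed; the dropped_table helper name is mine. Other DROP "kinds" such as DROP SCHEMA parse with an empty table name, which is why out_tables stays empty in the test_snowflake_drop_schema.json golden file above.

import sqlglot
from sqlglot import exp


def dropped_table(sql: str, dialect: str):
    """Return (db, name) for a DROP TABLE / DROP VIEW statement, else None."""
    statement = sqlglot.parse_one(sql, dialect=dialect)
    if isinstance(statement, exp.Drop):
        target = statement.this
        # Mirrors the patch's guard: other DROP "kinds" (e.g. DROP SCHEMA)
        # do not populate the table name, so they are filtered out here.
        if isinstance(target, exp.Table) and target.this and target.name:
            return target.db, target.name
    return None


print(dropped_table("DROP TABLE my_schema.my_table", "sqlite"))  # ('my_schema', 'my_table')
print(dropped_table("DROP VIEW my_schema.my_view", "sqlite"))    # ('my_schema', 'my_view')
print(dropped_table("DROP SCHEMA my_schema", "snowflake"))       # None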
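
And for the editable-dataset-name change in PATCH 11/12: a hedged sketch of exercising the rename path end to end, assuming a local DataHub deployment with the editableDatasetNameEnabled flag turned on. The endpoint, token value, and dataset URN below are placeholders; the updateName mutation is the one UpdateNameResolver handles, now persisting editableDatasetProperties.name for datasets.

import requests

UPDATE_NAME = """
mutation updateName($input: UpdateNameInput!) {
  updateName(input: $input)
}
"""

resp = requests.post(
    "http://localhost:8080/api/graphql",  # assumed local GMS GraphQL endpoint
    headers={"Authorization": "Bearer <personal-access-token>"},  # placeholder token
    json={
        "query": UPDATE_NAME,
        "variables": {
            "input": {
                "name": "costs (renamed)",  # lands in editableDatasetProperties.name
                "urn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)",
            }
        },
    },
    timeout=10,
)
resp.raise_for_status()
print(resp.json())  # expect {"data": {"updateName": true}} on success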