Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(ingest/databricks): use latest report message format for warning messages #11319

Merged
merged 12 commits into from
Sep 11, 2024
116 changes: 87 additions & 29 deletions metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,10 +305,9 @@ def _is_single_row_query_method(query: Any) -> bool:
if frame.name in SINGLE_ROW_QUERY_METHODS:
return True
if frame.name in CONSTANT_ROW_QUERY_METHODS:
# TODO: figure out how to handle these.
# A cross join will return (`constant` ** `queries`) rows rather
# than `constant` rows with `queries` columns.
# See https://stackoverflow.com/questions/35638753/create-query-to-join-2-tables-1-on-1-with-nothing-in-common.
# TODO: figure out how to handle these. A cross join will return (`constant` ** `queries`) rows rather
# than `constant` rows with `queries` columns. See
# https://stackoverflow.com/questions/35638753/create-query-to-join-2-tables-1-on-1-with-nothing-in-common.
return False

if frame.name == COLUMN_MAP_QUERY_METHOD:
Expand Down Expand Up @@ -429,9 +428,12 @@ def _get_column_cardinality(
logger.debug(
f"Caught exception while attempting to get column cardinality for column {column}. {e}"
)

self.report.report_warning(
"Profiling - Unable to get column cardinality",
f"{self.dataset_name}.{column}",
title="Profiling: Unable to Calculate Cardinality",
message="The cardinality for the column will not be accessible",
context=f"{self.dataset_name}.{column}",
exc=e,
)
return

Expand Down Expand Up @@ -484,14 +486,15 @@ def _get_dataset_rows(self, dataset_profile: DatasetProfileClass) -> None:
self.dataset.engine.execute(get_estimate_script).scalar()
)
else:
# If the configuration is not set to 'estimate only' mode, we directly obtain the row count from the dataset.
# However, if an offset or limit is set, we need to adjust how we calculate the row count.
# This is because applying a limit or offset could potentially skew the row count.
# For instance, if a limit is set and the actual row count exceeds this limit,
# the returned row count would incorrectly be the limit value.
# If the configuration is not set to 'estimate only' mode, we directly obtain the row count from the
# dataset. However, if an offset or limit is set, we need to adjust how we calculate the row count. This
# is because applying a limit or offset could potentially skew the row count. For instance, if a limit is
# set and the actual row count exceeds this limit, the returned row count would incorrectly be the limit
# value.
#
# To address this, if a limit is set, we use the original table name when calculating the row count.
# This ensures that the row count is based on the original table, not on a view which have limit or offset applied.
# To address this, if a limit or offset is set, we use the original table name when calculating the row
# count. This ensures that the row count is based on the original table, not on a view that has a limit or
# offset applied.
if (self.config.limit or self.config.offset) and not self.custom_sql:
# We don't want limit and offset to get applied to the row count
# This is a somewhat hacky way to do it, but every other approach would require major refactoring
Expand All @@ -513,9 +516,16 @@ def _get_dataset_column_min(
logger.debug(
f"Caught exception while attempting to get column min for column {column}. {e}"
)

self.report.report_warning(
"Profiling - Unable to get column min",
f"{self.dataset_name}.{column}",
title="Profiling: Unable to Calculate Min",
message="The min for the column will not be accessible",
context=f"""
{{
"column": "{self.dataset_name}.{column}"
}}
sid-acryl marked this conversation as resolved.
Show resolved Hide resolved
""",
exc=e,
)

@_run_with_query_combiner
Expand All @@ -530,9 +540,16 @@ def _get_dataset_column_max(
logger.debug(
f"Caught exception while attempting to get column max for column {column}. {e}"
)

self.report.report_warning(
"Profiling - Unable to get column max",
f"{self.dataset_name}.{column}",
title="Profiling: Unable to Calculate Max",
message="The max for the column will not be accessible",
context=f"""
{{
"column": "{self.dataset_name}.{column}"
}}
""",
exc=e,
)

@_run_with_query_combiner
Expand All @@ -547,9 +564,16 @@ def _get_dataset_column_mean(
logger.debug(
f"Caught exception while attempting to get column mean for column {column}. {e}"
)

self.report.report_warning(
"Profiling - Unable to get column mean",
f"{self.dataset_name}.{column}",
title="Profiling: Unable to Calculate Mean",
sid-acryl marked this conversation as resolved.
Show resolved Hide resolved
message="The mean for the column will not be accessible",
context=f"""
{{
"column": "{self.dataset_name}.{column}"
}}
""",
exc=e,
)

@_run_with_query_combiner
Expand Down Expand Up @@ -581,9 +605,16 @@ def _get_dataset_column_median(
logger.debug(
f"Caught exception while attempting to get column median for column {column}. {e}"
)

self.report.report_warning(
"Profiling - Unable to get column medians",
f"{self.dataset_name}.{column}",
title="Profiling: Unable to Calculate Medians",
message="The medians for the column will not be accessible",
context=f"""
{{
"column": "{self.dataset_name}.{column}"
}}
""",
exc=e,
)

@_run_with_query_combiner
Expand All @@ -599,8 +630,14 @@ def _get_dataset_column_stdev(
f"Caught exception while attempting to get column stddev for column {column}. {e}"
)
self.report.report_warning(
"Profiling - Unable to get column stddev",
f"{self.dataset_name}.{column}",
title="Profiling: Unable to Calculate Standard Deviation",
message="The standard deviation for the column will not be accessible",
context=f"""
{{
"column": "{self.dataset_name}.{column}"
}}
""",
exc=e,
)

@_run_with_query_combiner
Expand Down Expand Up @@ -638,9 +675,16 @@ def _get_dataset_column_quantiles(
logger.debug(
f"Caught exception while attempting to get column quantiles for column {column}. {e}"
)

self.report.report_warning(
"Profiling - Unable to get column quantiles",
f"{self.dataset_name}.{column}",
title="Profiling: Unable to Calculate Quantiles",
message="The quantiles for the column will not be accessible",
context=f"""
{{
"column": "{self.dataset_name}.{column}"
}}
""",
exc=e,
)

@_run_with_query_combiner
Expand Down Expand Up @@ -682,9 +726,16 @@ def _get_dataset_column_histogram(
logger.debug(
f"Caught exception while attempting to get column histogram for column {column}. {e}"
)

self.report.report_warning(
"Profiling - Unable to get column histogram",
f"{self.dataset_name}.{column}",
title="Profiling: Unable to Calculate Histogram",
message="The histogram for the column will not be accessible",
context=f"""
{{
"column": "{self.dataset_name}.{column}"
}}
""",
exc=e,
)

@_run_with_query_combiner
Expand Down Expand Up @@ -714,9 +765,16 @@ def _get_dataset_column_sample_values(
logger.debug(
f"Caught exception while attempting to get sample values for column {column}. {e}"
)

self.report.report_warning(
"Profiling - Unable to get column sample values",
f"{self.dataset_name}.{column}",
title="Profiling: Unable to Calculate Sample Values",
message="The sample values for the column will not be accessible",
context=f"""
{{
"column": "{self.dataset_name}.{column}"
}}
""",
exc=e,
)

def generate_dataset_profile( # noqa: C901 (complexity)
Expand Down
Loading