From de8250990200e1b2a2a981c4803c89ed071469e2 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Tue, 10 Dec 2024 10:57:32 +0100
Subject: [PATCH 001/129] Set LineageAtom.other to dict by default

---
 src/databricks/labs/ucx/source_code/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py
index d02dbcdb70..599fbbf6bb 100644
--- a/src/databricks/labs/ucx/source_code/base.py
+++ b/src/databricks/labs/ucx/source_code/base.py
@@ -178,7 +178,7 @@ class LineageAtom:
 
     object_type: str
     object_id: str
-    other: dict[str, str] | None = None
+    other: dict[str, str] = field(default_factory=dict)
 
 
 @dataclass

From daa03940801d594c4685b88ef89fe7612dd70940 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Tue, 10 Dec 2024 11:57:38 +0100
Subject: [PATCH 002/129] Add dashboard progress encoder

---
 .../labs/ucx/progress/dashboards.py | 89 +++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 src/databricks/labs/ucx/progress/dashboards.py

diff --git a/src/databricks/labs/ucx/progress/dashboards.py b/src/databricks/labs/ucx/progress/dashboards.py
new file mode 100644
index 0000000000..a09a541464
--- /dev/null
+++ b/src/databricks/labs/ucx/progress/dashboards.py
@@ -0,0 +1,89 @@
+import collections
+from dataclasses import replace
+from functools import cached_property
+
+from databricks.labs.lsql.backends import SqlBackend
+
+from databricks.labs.ucx.assessment.dashboards import Dashboard, DashboardOwnership
+from databricks.labs.ucx.progress.history import ProgressEncoder
+from databricks.labs.ucx.progress.install import Historical
+from databricks.labs.ucx.source_code.directfs_access import DirectFsAccessCrawler
+from databricks.labs.ucx.source_code.queries import QueryProblem
+
+
+class DashboardProgressEncoder(ProgressEncoder[Dashboard]):
+    """Encoder class:Dashboard to class:History."""
+
+    def __init__(
+        self,
+        sql_backend: SqlBackend,
+        ownership: DashboardOwnership,
+        direct_fs_access_crawlers: list[DirectFsAccessCrawler],
+        inventory_database: str,
+        run_id: int,
+        workspace_id: int,
+        catalog: str,
+    ) -> None:
+        super().__init__(
+            sql_backend,
+            ownership,
+            Dashboard,
+            run_id,
+            workspace_id,
+            catalog,
+            "multiworkspace",
+            "historical",
+        )
+        self._inventory_database = inventory_database
+        self._direct_fs_access_crawlers = direct_fs_access_crawlers
+
+    @cached_property
+    def _query_problems(self) -> dict[str, list[str]]:
+        index = collections.defaultdict(list)
+        for row in self._sql_backend.fetch(
+            'SELECT * FROM query_problems',
+            catalog='hive_metastore',
+            schema=self._inventory_database,
+        ):
+            problem = QueryProblem(**row.asDict())
+            failure = (
+                f'[{problem.code}] {problem.query_name} ({problem.dashboard_id}/{problem.query_id}) : {problem.message}'
+            )
+            index[problem.dashboard_id].append(failure)
+        return index
+
+    @cached_property
+    def _direct_fs_accesses(self) -> dict[str, list[str]]:
+        index = collections.defaultdict(list)
+        for crawler in self._direct_fs_access_crawlers:
+            for direct_fs_access in crawler.snapshot():
+                # The dashboard and query source lineage are added by the QueryLinter
+                if len(direct_fs_access.source_lineage) < 2:
+                    continue
+                if direct_fs_access.source_lineage[0].object_type != "DASHBOARD":  # Note: this skips dangling queries
+                    continue
+                if direct_fs_access.source_lineage[1].object_type != "QUERY":
+                    continue
+                dashboard_id = direct_fs_access.source_lineage[0].object_id
+                query_id = direct_fs_access.source_lineage[1].object_id  # <dashboard id>/<query id>
+                query_name = direct_fs_access.source_lineage[1].other.get("name", "UNKNOWN")
+                # Follow same failure message structure as the QueryProblem above and DirectFsAccessPyLinter deprecation
+                code = "direct-filesystem-access"
+                message = f"The use of direct filesystem references is deprecated: {direct_fs_access.path}"
+                failure = f"[{code}] {query_name} ({query_id}) : {message}"
+                index[dashboard_id].append(failure)
+        return index
+
+    def _encode_record_as_historical(self, record: Dashboard) -> Historical:
+        """Encode a dashboard as a historical record.
+
+        Failures are detected by the QueryLinter:
+        - Query problems
+        - Direct filesystem access by code used in query
+        """
+        historical = super()._encode_record_as_historical(record)
+        failures = []
+        failures.extend(self._query_problems.get(record.id, []))
+        failures.extend(self._direct_fs_accesses.get(record.id, []))
+        # TODO: Add UsedTable
+        return replace(historical, failures=historical.failures + failures)
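A note on PATCH 001 before moving on: dataclasses deliberately reject plainly mutable defaults, which is why the patch uses field(default_factory=dict) rather than other: dict[str, str] = {}. The snippet below is a standalone illustration of that rule; only the LineageAtom shape is taken from the patch.

    from dataclasses import dataclass, field

    @dataclass
    class LineageAtom:
        object_type: str
        object_id: str
        # A literal {} default raises ValueError when the class is defined;
        # a factory gives every instance its own fresh dict.
        other: dict[str, str] = field(default_factory=dict)

    atom_a = LineageAtom("QUERY", "q1")
    atom_b = LineageAtom("QUERY", "q2")
    atom_a.other["name"] = "my query"
    assert atom_b.other == {}  # no state is shared between instances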
From 07f608c975e9bc983aa064be554aff394b15bc59 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Tue, 10 Dec 2024 14:05:09 +0100
Subject: [PATCH 003/129] Get table failures from historical table snapshot

---
 .../labs/ucx/hive_metastore/tables.py | 23 +++++++++++
 .../labs/ucx/progress/dashboards.py   | 38 ++++++++++++++++++-
 2 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/src/databricks/labs/ucx/hive_metastore/tables.py b/src/databricks/labs/ucx/hive_metastore/tables.py
index 0bfba33493..c96106bd80 100644
--- a/src/databricks/labs/ucx/hive_metastore/tables.py
+++ b/src/databricks/labs/ucx/hive_metastore/tables.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import logging
 import re
 import typing
@@ -86,6 +88,27 @@ def __post_init__(self) -> None:
         if isinstance(self.table_format, str):  # Should not happen according to type hint, still safer
             self.table_format = self.table_format.upper()
 
+    @classmethod
+    def from_historical_data(cls, data: dict[str, str]) -> Table:
+        kwargs: dict[str, str | bool] = {
+            "catalog": data["catalog"],
+            "database": data["database"],
+            "name": data["name"],
+            "table_format": data["table_format"],
+            "location": data["table_format"],
+        }
+        if "location" in data:
+            kwargs["location"] = data["location"]
+        if "view_text" in data:
+            kwargs["view_text"] = data["view_text"]
+        if "upgraded_to" in data:
+            kwargs["upgraded_to"] = data["upgraded_to"]
+        if "storage_properties" in data:
+            kwargs["storage_properties"] = data["storage_properties"]
+        if "is_partitioned" in data:
+            kwargs["is_partitioned"] = bool(data["is_partitioned"])
+        return cls(**kwargs)  # type: ignore
+
     @property
     def is_delta(self) -> bool:
         if self.table_format is None:
diff --git a/src/databricks/labs/ucx/progress/dashboards.py b/src/databricks/labs/ucx/progress/dashboards.py
index a09a541464..978bb08216 100644
--- a/src/databricks/labs/ucx/progress/dashboards.py
+++ b/src/databricks/labs/ucx/progress/dashboards.py
@@ -5,10 +5,13 @@
 from databricks.labs.lsql.backends import SqlBackend
 
 from databricks.labs.ucx.assessment.dashboards import Dashboard, DashboardOwnership
+from databricks.labs.ucx.hive_metastore.tables import Table
 from databricks.labs.ucx.progress.history import ProgressEncoder
 from databricks.labs.ucx.progress.install import Historical
+from databricks.labs.ucx.source_code.base import UsedTable
 from databricks.labs.ucx.source_code.directfs_access import DirectFsAccessCrawler
 from databricks.labs.ucx.source_code.queries import QueryProblem
+from databricks.labs.ucx.source_code.used_table import UsedTablesCrawler
 
 
 class DashboardProgressEncoder(ProgressEncoder[Dashboard]):
@@ -19,6 +22,7 @@ def __init__(
         sql_backend: SqlBackend,
         ownership: DashboardOwnership,
         direct_fs_access_crawlers: list[DirectFsAccessCrawler],
+        used_tables_crawlers: list[UsedTablesCrawler],
         inventory_database: str,
         run_id: int,
         workspace_id: int,
@@ -36,6 +40,7 @@ def __init__(
         )
         self._inventory_database = inventory_database
         self._direct_fs_access_crawlers = direct_fs_access_crawlers
+        self._used_tables_crawlers = used_tables_crawlers
 
     @cached_property
     def _query_problems(self) -> dict[str, list[str]]:
@@ -74,6 +79,37 @@ def _direct_fs_accesses(self) -> dict[str, list[str]]:
             index[dashboard_id].append(failure)
         return index
 
+    @cached_property
+    def _used_tables(self) -> dict[str, list[UsedTable]]:
+        index = collections.defaultdict(list)
+        for crawler in self._used_tables_crawlers:
+            for used_table in crawler.snapshot():
+                # The dashboard and query source lineage are added by the QueryLinter
+                if len(used_table.source_lineage) < 2:
+                    continue
+                if used_table.source_lineage[0].object_type != "DASHBOARD":  # Note: this skips dangling queries
+                    continue
+                if used_table.source_lineage[1].object_type != "QUERY":
+                    continue
+                dashboard_id = used_table.source_lineage[0].object_id
+                index[dashboard_id].append(used_table)
+        return index
+
+    @cached_property
+    def _tables_failures(self) -> dict[str, list[str]]:
+        table_failures = {}
+        for row in self._sql_backend.fetch(
+            f"SELECT * FROM `{self._catalog}`.`{self._schema}`.`objects_snapshot` WHERE object_type = 'Table'"
+        ):
+            historical = Historical(**row.asDict())
+            table = Table.from_historical_data(historical.data)
+            table_failures[table.full_name] = historical.failures
+        index = collections.defaultdict(list)
+        for dashboard_id, used_tables in self._used_tables.items():
+            for used_table in used_tables:
+                index[dashboard_id].extend(table_failures.get(used_table.full_name, []))
+        return index
+
     def _encode_record_as_historical(self, record: Dashboard) -> Historical:
         """Encode a dashboard as a historical record.
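The _tables_failures property in PATCH 003 performs a two-step join: used-table lineage maps each dashboard to table names, and the historical objects_snapshot maps table names to failures. A minimal standalone sketch of that join, with hand-made stand-ins for both inputs:

    import collections

    # dict[<table full name>, list[<failure>]], as read from objects_snapshot
    table_failures = {"hive_metastore.sales.orders": ["Pending migration"]}
    # dict[<dashboard id>, list[<table full name>]], as derived from lineage
    used_tables = {"dash-1": ["hive_metastore.sales.orders"]}

    index: dict[str, list[str]] = collections.defaultdict(list)
    for dashboard_id, full_names in used_tables.items():
        for full_name in full_names:
            index[dashboard_id].extend(table_failures.get(full_name, []))

    assert index["dash-1"] == ["Pending migration"]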
From 0aa8b4f76f7b22028e7f4e253175f00aa19fb748 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Tue, 10 Dec 2024 14:43:20 +0100
Subject: [PATCH 004/129] Allow other to be None

---
 src/databricks/labs/ucx/progress/dashboards.py | 4 +++-
 src/databricks/labs/ucx/source_code/base.py    | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/databricks/labs/ucx/progress/dashboards.py b/src/databricks/labs/ucx/progress/dashboards.py
index 978bb08216..d2c551de98 100644
--- a/src/databricks/labs/ucx/progress/dashboards.py
+++ b/src/databricks/labs/ucx/progress/dashboards.py
@@ -71,7 +71,9 @@ def _direct_fs_accesses(self) -> dict[str, list[str]]:
                     continue
                 dashboard_id = direct_fs_access.source_lineage[0].object_id
                 query_id = direct_fs_access.source_lineage[1].object_id  # <dashboard id>/<query id>
-                query_name = direct_fs_access.source_lineage[1].other.get("name", "UNKNOWN")
+                query_name = "UNKNOWN"
+                if direct_fs_access.source_lineage[1].other and "name" in direct_fs_access.source_lineage[1].other:
+                    query_name = direct_fs_access.source_lineage[1].other["name"]
                 # Follow same failure message structure as the QueryProblem above and DirectFsAccessPyLinter deprecation
                 code = "direct-filesystem-access"
                 message = f"The use of direct filesystem references is deprecated: {direct_fs_access.path}"
diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py
index 599fbbf6bb..d02dbcdb70 100644
--- a/src/databricks/labs/ucx/source_code/base.py
+++ b/src/databricks/labs/ucx/source_code/base.py
@@ -178,7 +178,7 @@ class LineageAtom:
 
     object_type: str
     object_id: str
-    other: dict[str, str] = field(default_factory=dict)
+    other: dict[str, str] | None = None
 
 
 @dataclass
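PATCH 004 reverts the default because persisted LineageAtom records may genuinely carry no other payload, so readers have to guard for None; dict.get alone is not enough once the attribute itself can be None. A hypothetical helper, not part of the patch, showing the guard in isolation:

    from databricks.labs.ucx.source_code.base import LineageAtom

    def lineage_name(atom: LineageAtom, default: str = "UNKNOWN") -> str:
        # atom.other may be None after this patch, hence the two-part check
        if atom.other and "name" in atom.other:
            return atom.other["name"]
        return default

    assert lineage_name(LineageAtom("QUERY", "q1")) == "UNKNOWN"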
@@ -85,5 +121,5 @@ def _encode_record_as_historical(self, record: Dashboard) -> Historical:
         failures = []
         failures.extend(self._query_problems.get(record.id, []))
         failures.extend(self._direct_fs_accesses.get(record.id, []))
-        # TODO: Add UsedTable
+        failures.extend(self._tables_failures.get(record.id, []))
         return replace(historical, failures=historical.failures + failures)

From cbff5714c62e43fe252e8929bc13b458af5808a0 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 11 Dec 2024 11:31:30 +0100
Subject: [PATCH 005/129] Remove cached properties from dashboard progress encoder

---
 .../labs/ucx/progress/dashboards.py | 54 +++++++++++++------
 1 file changed, 39 insertions(+), 15 deletions(-)

diff --git a/src/databricks/labs/ucx/progress/dashboards.py b/src/databricks/labs/ucx/progress/dashboards.py
index d2c551de98..4ba5e30d86 100644
--- a/src/databricks/labs/ucx/progress/dashboards.py
+++ b/src/databricks/labs/ucx/progress/dashboards.py
@@ -1,10 +1,12 @@
 import collections
+import logging
+from collections.abc import Iterable
 from dataclasses import replace
-from functools import cached_property
 
 from databricks.labs.lsql.backends import SqlBackend
 
 from databricks.labs.ucx.assessment.dashboards import Dashboard, DashboardOwnership
+from databricks.labs.ucx.framework.utils import escape_sql_identifier
 from databricks.labs.ucx.hive_metastore.tables import Table
 from databricks.labs.ucx.progress.history import ProgressEncoder
 from databricks.labs.ucx.progress.install import Historical
@@ -14,6 +16,12 @@
 from databricks.labs.ucx.source_code.used_table import UsedTablesCrawler
 
 
+logger = logging.getLogger(__name__)
+
+
+DashboardIdToFailuresType = dict[str, list[str]]  # dict[<dashboard id>, list[<failure>]]
+
+
 class DashboardProgressEncoder(ProgressEncoder[Dashboard]):
     """Encoder class:Dashboard to class:History."""
 
@@ -42,8 +50,19 @@ def __init__(
         self._direct_fs_access_crawlers = direct_fs_access_crawlers
         self._used_tables_crawlers = used_tables_crawlers
 
-    @cached_property
-    def _query_problems(self) -> dict[str, list[str]]:
+    def append_inventory_snapshot(self, snapshot: Iterable[Dashboard]) -> None:
+        query_problems = self._get_query_problems()
+        dfsas = self._get_direct_filesystem_accesses()
+        table_failures = self._get_tables_failures()
+        history_records = []
+        for record in snapshot:
+            history_record = self._encode_dashboard_as_historical(record, query_problems, dfsas, table_failures)
+            history_records.append(history_record)
+        logger.debug(f"Appending {len(history_records)} {self._klass} table record(s) to history.")
+        # The mode is 'append'. This is documented as conflict-free.
+        self._sql_backend.save_table(escape_sql_identifier(self.full_name), history_records, Historical, mode="append")
+
+    def _get_query_problems(self) -> DashboardIdToFailuresType:
         index = collections.defaultdict(list)
         for row in self._sql_backend.fetch(
             'SELECT * FROM query_problems',
@@ -57,8 +76,7 @@ def _query_problems(self) -> dict[str, list[str]]:
             index[problem.dashboard_id].append(failure)
         return index
 
-    @cached_property
-    def _direct_fs_accesses(self) -> dict[str, list[str]]:
+    def _get_direct_filesystem_accesses(self) -> DashboardIdToFailuresType:
         index = collections.defaultdict(list)
         for crawler in self._direct_fs_access_crawlers:
@@ -81,8 +99,7 @@ def _direct_fs_accesses(self) -> dict[str, list[str]]:
             index[dashboard_id].append(failure)
         return index
 
-    @cached_property
-    def _used_tables(self) -> dict[str, list[UsedTable]]:
+    def _get_used_tables(self) -> dict[str, list[UsedTable]]:
         index = collections.defaultdict(list)
         for crawler in self._used_tables_crawlers:
@@ -97,8 +114,7 @@ def _used_tables(self) -> dict[str, list[UsedTable]]:
             index[dashboard_id].append(used_table)
         return index
 
-    @cached_property
-    def _tables_failures(self) -> dict[str, list[str]]:
+    def _get_tables_failures(self) -> DashboardIdToFailuresType:
         table_failures = {}
         for row in self._sql_backend.fetch(
@@ -107,21 +123,29 @@ def _tables_failures(self) -> dict[str, list[str]]:
             historical = Historical(**row.asDict())
             table = Table.from_historical_data(historical.data)
             table_failures[table.full_name] = historical.failures
         index = collections.defaultdict(list)
-        for dashboard_id, used_tables in self._used_tables.items():
-            for used_table in used_tables:
+        used_tables = self._get_used_tables()
+        for dashboard_id, used_tables_in_dashboard in used_tables.items():
+            for used_table in used_tables_in_dashboard:
                 index[dashboard_id].extend(table_failures.get(used_table.full_name, []))
         return index
 
-    def _encode_record_as_historical(self, record: Dashboard) -> Historical:
+    def _encode_dashboard_as_historical(
+        self,
+        record: Dashboard,
+        query_problems: DashboardIdToFailuresType,
+        dfsas: DashboardIdToFailuresType,
+        tables_failures: DashboardIdToFailuresType,
+    ) -> Historical:
         """Encode a dashboard as a historical record.
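The motivation for PATCH 005 in one picture: functools.cached_property computes once per instance and never again, so an encoder that survives across runs would keep serving its first snapshot of problems and failures. Illustrative only; counters stand in for the real table scans.

    from functools import cached_property

    class Snapshotter:
        def __init__(self) -> None:
            self.scans = 0

        @cached_property
        def cached(self) -> int:
            self.scans += 1
            return self.scans

        def fresh(self) -> int:
            self.scans += 1
            return self.scans

    s = Snapshotter()
    assert (s.cached, s.cached) == (1, 1)    # second access hits the cache
    assert (s.fresh(), s.fresh()) == (2, 3)  # a plain method re-reads each time

Fetching the three failure maps once per append_inventory_snapshot call, as the patch does, keeps lookups cheap inside the loop while still refreshing them on every run.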
From 00b32dbb471135b6a696e9800b1c8d3cea9cb4df Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 11 Dec 2024 11:34:48 +0100
Subject: [PATCH 006/129] Add first integration test for dashboard progress encoder

---
 tests/integration/progress/test_dashboards.py | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 tests/integration/progress/test_dashboards.py

diff --git a/tests/integration/progress/test_dashboards.py b/tests/integration/progress/test_dashboards.py
new file mode 100644
index 0000000000..1964784c8e
--- /dev/null
+++ b/tests/integration/progress/test_dashboards.py
@@ -0,0 +1,20 @@
+from databricks.labs.ucx.framework.utils import escape_sql_identifier
+
+
+def test_dashboard_progress_encoder_table_failures(runtime_ctx, az_cli_ctx) -> None:
+    failures = []
+    az_cli_ctx.progress_tracking_installation.run()
+    runtime_ctx = runtime_ctx.replace(
+        parent_run_id=1,
+        sql_backend=az_cli_ctx.sql_backend,
+        ucx_catalog=az_cli_ctx.ucx_catalog,
+    )
+    dashboard = az_cli_ctx.make_dashboard()
+
+    runtime_ctx.tables_progress.append_inventory_snapshot([dashboard])
+
+    history_table_name = escape_sql_identifier(runtime_ctx.tables_progress.full_name)
+    records = list(runtime_ctx.sql_backend.fetch(f"SELECT * FROM {history_table_name}"))
+
+    assert len(records) == 1, "Expected one historical entry"
+    assert records[0].failures == failures
From c49aa9f037e6c4465e53b1fd57f9a579a46d3700 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 11 Dec 2024 11:44:20 +0100
Subject: [PATCH 007/129] Test dashboard progress encoder without failures

---
 tests/unit/progress/test_dashboards.py | 35 ++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 tests/unit/progress/test_dashboards.py

diff --git a/tests/unit/progress/test_dashboards.py b/tests/unit/progress/test_dashboards.py
new file mode 100644
index 0000000000..1ec94ee4f0
--- /dev/null
+++ b/tests/unit/progress/test_dashboards.py
@@ -0,0 +1,35 @@
+from unittest.mock import create_autospec
+
+from databricks.labs.ucx.assessment.dashboards import Dashboard
+from databricks.labs.ucx.framework.owners import Ownership
+from databricks.labs.ucx.framework.utils import escape_sql_identifier
+from databricks.labs.ucx.progress.dashboards import DashboardProgressEncoder
+from databricks.labs.ucx.source_code.directfs_access import DirectFsAccessCrawler
+from databricks.labs.ucx.source_code.used_table import UsedTablesCrawler
+
+
+def test_dashboard_progress_encoder_no_failures(mock_backend) -> None:
+    ownership = create_autospec(Ownership)
+    ownership.owner_of.return_value = "user"
+    direct_fs_access_crawler = create_autospec(DirectFsAccessCrawler)
+    used_tables_crawler = create_autospec(UsedTablesCrawler)
+    encoder = DashboardProgressEncoder(
+        mock_backend,
+        ownership,
+        [direct_fs_access_crawler],
+        [used_tables_crawler],
+        inventory_database="inventory",
+        run_id=1,
+        workspace_id=123456789,
+        catalog="test",
+    )
+    dashboard = Dashboard("did")
+
+    encoder.append_inventory_snapshot([dashboard])
+
+    rows = mock_backend.rows_written_for(escape_sql_identifier(encoder.full_name), "append")
+    assert rows, f"No rows written for: {encoder.full_name}"
+    assert len(rows[0].failures) == 0
+    ownership.owner_of.assert_called_once()
+    direct_fs_access_crawler.snapshot.assert_called_once()
+    used_tables_crawler.snapshot.assert_called_once()
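Worth calling out in PATCH 007: the crawlers are mocked with create_autospec rather than a bare MagicMock, so a call that drifts away from the real class signature fails the test instead of silently passing. A standalone sketch:

    from unittest.mock import create_autospec

    class Crawler:
        def snapshot(self) -> list:
            return []

    mock = create_autospec(Crawler)
    mock.snapshot.return_value = ["row"]
    assert mock.snapshot() == ["row"]
    mock.snapshot.assert_called_once()
    try:
        mock.no_such_method()
    except AttributeError:
        pass  # not on the spec, so the autospec mock rejects it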
dashboard = Dashboard("did") + + encoder.append_inventory_snapshot([dashboard]) + + rows = mock_backend.rows_written_for(escape_sql_identifier(encoder.full_name), "append") + assert rows, f"No rows written for: {encoder.full_name}" + assert len(rows[0].failures) == 0 + ownership.owner_of.assert_called_once() + direct_fs_access_crawler.snapshot.assert_called_once() + used_tables_crawler.snapshot.assert_called_once() From 932d1b18d96466f53df151f4a00683373fe46cfe Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 11 Dec 2024 12:57:26 +0100 Subject: [PATCH 008/129] Test dashboard failure coming from query problem --- tests/unit/progress/test_dashboards.py | 66 +++++++++++++++++++++++++- 1 file changed, 65 insertions(+), 1 deletion(-) diff --git a/tests/unit/progress/test_dashboards.py b/tests/unit/progress/test_dashboards.py index 1ec94ee4f0..014b04567e 100644 --- a/tests/unit/progress/test_dashboards.py +++ b/tests/unit/progress/test_dashboards.py @@ -1,5 +1,7 @@ from unittest.mock import create_autospec +from databricks.labs.lsql.backends import MockBackend, Row + from databricks.labs.ucx.assessment.dashboards import Dashboard from databricks.labs.ucx.framework.owners import Ownership from databricks.labs.ucx.framework.utils import escape_sql_identifier @@ -8,7 +10,23 @@ from databricks.labs.ucx.source_code.used_table import UsedTablesCrawler -def test_dashboard_progress_encoder_no_failures(mock_backend) -> None: +def test_dashboard_progress_encoder_no_failures() -> None: + mock_backend = MockBackend( + rows={ + "SELECT \\* FROM query_problems": [ + Row( + dashboard_id="12345", # Not a match with dashboard below, hence no failure + dashboard_parent="dashbards/parent", + dashboard_name="my_dashboard", + query_id="23456", + query_parent="queries/parent", + query_name="my_query", + code="sql-parse-error", + message="Could not parse SQL", + ) + ] + } + ) ownership = create_autospec(Ownership) ownership.owner_of.return_value = "user" direct_fs_access_crawler = create_autospec(DirectFsAccessCrawler) @@ -33,3 +51,49 @@ def test_dashboard_progress_encoder_no_failures(mock_backend) -> None: ownership.owner_of.assert_called_once() direct_fs_access_crawler.snapshot.assert_called_once() used_tables_crawler.snapshot.assert_called_once() + + +def test_dashboard_progress_encoder_query_problem_as_failure() -> None: + failures = ["[sql-parse-error] my_query (12345/23456) : Could not parse SQL"] + + mock_backend = MockBackend( + rows={ + # The Mockbackend.fetch ignores the catalog and schema keyword arguments + "SELECT \\* FROM query_problems": [ + Row( + dashboard_id="12345", + dashboard_parent="dashbards/parent", + dashboard_name="my_dashboard", + query_id="23456", + query_parent="queries/parent", + query_name="my_query", + code="sql-parse-error", + message="Could not parse SQL", + ) + ] + } + ) + ownership = create_autospec(Ownership) + ownership.owner_of.return_value = "user" + direct_fs_access_crawler = create_autospec(DirectFsAccessCrawler) + used_tables_crawler = create_autospec(UsedTablesCrawler) + encoder = DashboardProgressEncoder( + mock_backend, + ownership, + [direct_fs_access_crawler], + [used_tables_crawler], + inventory_database="inventory", + run_id=1, + workspace_id=123456789, + catalog="test", + ) + dashboard = Dashboard("12345") + + encoder.append_inventory_snapshot([dashboard]) + + rows = mock_backend.rows_written_for(escape_sql_identifier(encoder.full_name), "append") + assert rows, f"No rows written for: {encoder.full_name}" + assert rows[0].failures == failures + 
From d9302035ca3f0b724416b440b58b4fb95db1248b Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 11 Dec 2024 13:10:33 +0100
Subject: [PATCH 009/129] Test dashboard failure coming from dfsa

---
 tests/unit/progress/test_dashboards.py | 65 +++++++++++++++++++++++++-
 1 file changed, 64 insertions(+), 1 deletion(-)

diff --git a/tests/unit/progress/test_dashboards.py b/tests/unit/progress/test_dashboards.py
index 014b04567e..53fbbf105f 100644
--- a/tests/unit/progress/test_dashboards.py
+++ b/tests/unit/progress/test_dashboards.py
@@ -6,7 +6,8 @@
 from databricks.labs.ucx.framework.owners import Ownership
 from databricks.labs.ucx.framework.utils import escape_sql_identifier
 from databricks.labs.ucx.progress.dashboards import DashboardProgressEncoder
-from databricks.labs.ucx.source_code.directfs_access import DirectFsAccessCrawler
+from databricks.labs.ucx.source_code.base import LineageAtom
+from databricks.labs.ucx.source_code.directfs_access import DirectFsAccess, DirectFsAccessCrawler
 from databricks.labs.ucx.source_code.used_table import UsedTablesCrawler
 
 
@@ -29,7 +30,22 @@ def test_dashboard_progress_encoder_no_failures() -> None:
     )
     ownership = create_autospec(Ownership)
     ownership.owner_of.return_value = "user"
+    dfsa = DirectFsAccess(
+        source_id="/path/to/write_dfsa.py",
+        source_lineage=[
+            LineageAtom(
+                object_type="DASHBOARD",
+                object_id="12345",  # Not a match with dashboard below, hence no failure
+                other={"parent": "parent", "name": "dashboard"},
+            ),
+            LineageAtom(object_type="QUERY", object_id="did/qid", other={"name": "query"}),
+        ],
+        path="dfsa:/path/to/data/",
+        is_read=False,
+        is_write=True,
+    )
     direct_fs_access_crawler = create_autospec(DirectFsAccessCrawler)
+    direct_fs_access_crawler.snapshot.return_value = [dfsa]
     used_tables_crawler = create_autospec(UsedTablesCrawler)
     encoder = DashboardProgressEncoder(
         mock_backend,
@@ -97,3 +113,50 @@ def test_dashboard_progress_encoder_query_problem_as_failure() -> None:
     ownership.owner_of.assert_called_once()
     direct_fs_access_crawler.snapshot.assert_called_once()
     used_tables_crawler.snapshot.assert_called_once()
+
+
+def test_dashboard_progress_encoder_direct_filesystem_access(mock_backend) -> None:
+    failures = [
+        "[direct-filesystem-access] query (did/qid) : "
+        "The use of direct filesystem references is deprecated: dfsa:/path/to/data/",
+    ]
+
+    ownership = create_autospec(Ownership)
+    ownership.owner_of.return_value = "user"
+    dfsa = DirectFsAccess(
+        source_id="/path/to/write_dfsa.py",
+        source_lineage=[
+            LineageAtom(
+                object_type="DASHBOARD",
+                object_id="did",
+                other={"parent": "parent", "name": "dashboard"},
+            ),
+            LineageAtom(object_type="QUERY", object_id="did/qid", other={"name": "query"}),
+        ],
+        path="dfsa:/path/to/data/",
+        is_read=False,
+        is_write=True,
+    )
+    direct_fs_access_crawler = create_autospec(DirectFsAccessCrawler)
+    direct_fs_access_crawler.snapshot.return_value = [dfsa]
+    used_tables_crawler = create_autospec(UsedTablesCrawler)
+    encoder = DashboardProgressEncoder(
+        mock_backend,
+        ownership,
+        [direct_fs_access_crawler],
+        [used_tables_crawler],
+        inventory_database="inventory",
+        run_id=1,
+        workspace_id=123456789,
+        catalog="test",
+    )
+    dashboard = Dashboard("did")
+
+    encoder.append_inventory_snapshot([dashboard])
+
+    rows = mock_backend.rows_written_for(escape_sql_identifier(encoder.full_name), "append")
+    assert rows, f"No rows written for: {encoder.full_name}"
+    assert rows[0].failures == failures
+    ownership.owner_of.assert_called_once()
+    direct_fs_access_crawler.snapshot.assert_called_once()
+    used_tables_crawler.snapshot.assert_called_once()
{encoder.full_name}" + assert rows[0].failures == failures + ownership.owner_of.assert_called_once() + direct_fs_access_crawler.snapshot.assert_called_once() + used_tables_crawler.snapshot.assert_called_once() From 3099f8172d000400a880f7acf0d2e5d5039d0ee0 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 11 Dec 2024 13:20:19 +0100 Subject: [PATCH 010/129] Rewrite tests to assert on historical rows --- .../labs/ucx/progress/dashboards.py | 4 +- tests/unit/progress/test_dashboards.py | 87 +++++++++++++------ 2 files changed, 62 insertions(+), 29 deletions(-) diff --git a/src/databricks/labs/ucx/progress/dashboards.py b/src/databricks/labs/ucx/progress/dashboards.py index 4ba5e30d86..27f0a85aa8 100644 --- a/src/databricks/labs/ucx/progress/dashboards.py +++ b/src/databricks/labs/ucx/progress/dashboards.py @@ -32,7 +32,7 @@ def __init__( direct_fs_access_crawlers: list[DirectFsAccessCrawler], used_tables_crawlers: list[UsedTablesCrawler], inventory_database: str, - run_id: int, + job_run_id: int, workspace_id: int, catalog: str, ) -> None: @@ -40,7 +40,7 @@ def __init__( sql_backend, ownership, Dashboard, - run_id, + job_run_id, workspace_id, catalog, "multiworkspace", diff --git a/tests/unit/progress/test_dashboards.py b/tests/unit/progress/test_dashboards.py index 53fbbf105f..d3744678e1 100644 --- a/tests/unit/progress/test_dashboards.py +++ b/tests/unit/progress/test_dashboards.py @@ -2,6 +2,7 @@ from databricks.labs.lsql.backends import MockBackend, Row +from databricks.labs.ucx.__about__ import __version__ as ucx_version from databricks.labs.ucx.assessment.dashboards import Dashboard from databricks.labs.ucx.framework.owners import Ownership from databricks.labs.ucx.framework.utils import escape_sql_identifier @@ -12,11 +13,24 @@ def test_dashboard_progress_encoder_no_failures() -> None: + expected = [ + Row( + workspace_id=123456789, + job_run_id=1, + object_type="Dashboard", + object_id=["did1"], + data={"id": "did1"}, + failures=[], + owner="cor", + ucx_version=ucx_version, + ) + ] + mock_backend = MockBackend( rows={ "SELECT \\* FROM query_problems": [ Row( - dashboard_id="12345", # Not a match with dashboard below, hence no failure + dashboard_id="did2", # Not a match with dashboard below, hence no failure dashboard_parent="dashbards/parent", dashboard_name="my_dashboard", query_id="23456", @@ -29,16 +43,16 @@ def test_dashboard_progress_encoder_no_failures() -> None: } ) ownership = create_autospec(Ownership) - ownership.owner_of.return_value = "user" + ownership.owner_of.return_value = "cor" dfsa = DirectFsAccess( source_id="/path/to/write_dfsa.py", source_lineage=[ LineageAtom( object_type="DASHBOARD", - object_id="12345", # Not a match with dashboard below, hence no failure + object_id="did3", # Not a match with dashboard below, hence no failure other={"parent": "parent", "name": "dashboard"}, ), - LineageAtom(object_type="QUERY", object_id="did/qid", other={"name": "query"}), + LineageAtom(object_type="QUERY", object_id="did3/qid1", other={"name": "query"}), ], path="dfsa:/path/to/data/", is_read=False, @@ -53,34 +67,44 @@ def test_dashboard_progress_encoder_no_failures() -> None: [direct_fs_access_crawler], [used_tables_crawler], inventory_database="inventory", - run_id=1, + job_run_id=1, workspace_id=123456789, catalog="test", ) - dashboard = Dashboard("did") + dashboard = Dashboard("did1") encoder.append_inventory_snapshot([dashboard]) rows = mock_backend.rows_written_for(escape_sql_identifier(encoder.full_name), "append") - assert rows, f"No rows written for: 
{encoder.full_name}" - assert len(rows[0].failures) == 0 + assert rows == expected ownership.owner_of.assert_called_once() direct_fs_access_crawler.snapshot.assert_called_once() used_tables_crawler.snapshot.assert_called_once() def test_dashboard_progress_encoder_query_problem_as_failure() -> None: - failures = ["[sql-parse-error] my_query (12345/23456) : Could not parse SQL"] + expected = [ + Row( + workspace_id=123456789, + job_run_id=1, + object_type="Dashboard", + object_id=["did1"], + data={"id": "did1"}, + failures=["[sql-parse-error] my_query (did1/qid1) : Could not parse SQL"], + owner="cor", + ucx_version=ucx_version, + ) + ] mock_backend = MockBackend( rows={ # The Mockbackend.fetch ignores the catalog and schema keyword arguments "SELECT \\* FROM query_problems": [ Row( - dashboard_id="12345", + dashboard_id="did1", dashboard_parent="dashbards/parent", dashboard_name="my_dashboard", - query_id="23456", + query_id="qid1", query_parent="queries/parent", query_name="my_query", code="sql-parse-error", @@ -90,7 +114,7 @@ def test_dashboard_progress_encoder_query_problem_as_failure() -> None: } ) ownership = create_autospec(Ownership) - ownership.owner_of.return_value = "user" + ownership.owner_of.return_value = "cor" direct_fs_access_crawler = create_autospec(DirectFsAccessCrawler) used_tables_crawler = create_autospec(UsedTablesCrawler) encoder = DashboardProgressEncoder( @@ -99,39 +123,49 @@ def test_dashboard_progress_encoder_query_problem_as_failure() -> None: [direct_fs_access_crawler], [used_tables_crawler], inventory_database="inventory", - run_id=1, + job_run_id=1, workspace_id=123456789, catalog="test", ) - dashboard = Dashboard("12345") + dashboard = Dashboard("did1") encoder.append_inventory_snapshot([dashboard]) rows = mock_backend.rows_written_for(escape_sql_identifier(encoder.full_name), "append") - assert rows, f"No rows written for: {encoder.full_name}" - assert rows[0].failures == failures + assert rows == expected ownership.owner_of.assert_called_once() direct_fs_access_crawler.snapshot.assert_called_once() used_tables_crawler.snapshot.assert_called_once() -def test_dashboard_progress_encoder_direct_filesystem_access(mock_backend) -> None: - failures = [ - "[direct-filesystem-access] query (did/qid) : " - "The use of direct filesystem references is deprecated: dfsa:/path/to/data/", +def test_dashboard_progress_encoder_direct_filesystem_access_as_failures(mock_backend) -> None: + expected = [ + Row( + workspace_id=123456789, + job_run_id=1, + object_type="Dashboard", + object_id=["did1"], + data={"id": "did1"}, + failures=[ + "[direct-filesystem-access] query (did1/qid1) : " + "The use of direct filesystem references is deprecated: dfsa:/path/to/data/", + ], + owner="cor", + ucx_version=ucx_version, + ) ] ownership = create_autospec(Ownership) - ownership.owner_of.return_value = "user" + ownership.owner_of.return_value = "cor" dfsa = DirectFsAccess( source_id="/path/to/write_dfsa.py", source_lineage=[ LineageAtom( object_type="DASHBOARD", - object_id="did", + object_id="did1", other={"parent": "parent", "name": "dashboard"}, ), - LineageAtom(object_type="QUERY", object_id="did/qid", other={"name": "query"}), + LineageAtom(object_type="QUERY", object_id="did1/qid1", other={"name": "query"}), ], path="dfsa:/path/to/data/", is_read=False, @@ -146,17 +180,16 @@ def test_dashboard_progress_encoder_direct_filesystem_access(mock_backend) -> No [direct_fs_access_crawler], [used_tables_crawler], inventory_database="inventory", - run_id=1, + job_run_id=1, 
From b9d6b0db94ff92404a7c316143ef92404a7c3161 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 11 Dec 2024 13:39:35 +0100
Subject: [PATCH 012/129] Merge tests

---
 tests/unit/progress/test_dashboards.py | 228 ++++++-------------------
 1 file changed, 56 insertions(+), 172 deletions(-)

diff --git a/tests/unit/progress/test_dashboards.py b/tests/unit/progress/test_dashboards.py
index e472c961ba..00af7d9139 100644
--- a/tests/unit/progress/test_dashboards.py
+++ b/tests/unit/progress/test_dashboards.py
@@ -1,5 +1,6 @@
 from unittest.mock import create_autospec
 
+import pytest
 from databricks.labs.lsql.backends import MockBackend, Row
 
 from databricks.labs.ucx.__about__ import __version__ as ucx_version
@@ -13,265 +14,140 @@ from databricks.labs.ucx.source_code.used_table import UsedTable, UsedTablesCrawler
 
 
-def test_dashboard_progress_encoder_no_failures() -> None:
-    expected = [
+@pytest.mark.parametrize(
+    "expected",
+    [
         Row(
             workspace_id=123456789,
             job_run_id=1,
             object_type="Dashboard",
             object_id=["did1"],
             data={"id": "did1"},
             failures=[],
             owner="cor",
             ucx_version=ucx_version,
-        )
-    ]
+        ),
+        Row(
+            workspace_id=123456789,
+            job_run_id=1,
+            object_type="Dashboard",
+            object_id=["did2"],
+            data={"id": "did2"},
+            failures=["[sql-parse-error] my_query (did2/qid1) : Could not parse SQL"],
+            owner="cor",
+            ucx_version=ucx_version,
+        ),
+        Row(
+            workspace_id=123456789,
+            job_run_id=1,
+            object_type="Dashboard",
+            object_id=["did3"],
+            data={"id": "did3"},
+            failures=[
+                "[direct-filesystem-access] query (did3/qid1) : "
+                "The use of direct filesystem references is deprecated: dfsa:/path/to/data/",
+            ],
+            owner="cor",
+            ucx_version=ucx_version,
+        ),
+        Row(
+            workspace_id=123456789,
+            job_run_id=1,
+            object_type="Dashboard",
+            object_id=["did4"],
+            data={"id": "did4"},
+            failures=["Used by TABLE: hive_metastore.schema.table"],
+            owner="cor",
+            ucx_version=ucx_version,
+        ),
+    ]
+)
+def test_dashboard_progress_encoder(expected: Row) -> None:
+    # The Mockbackend.fetch ignores the catalog and schema keyword arguments
     mock_backend = MockBackend(
         rows={
+            # Expect a query problem for dashboard two
             "SELECT \\* FROM query_problems": [
                 Row(
-                    dashboard_id="did2",  # Not a match with dashboard below, hence no failure
+                    dashboard_id="did2",
                     dashboard_parent="dashbards/parent",
                     dashboard_name="my_dashboard",
-                    query_id="23456",
+                    query_id="qid1",
                     query_parent="queries/parent",
                     query_name="my_query",
                     code="sql-parse-error",
                     message="Could not parse SQL",
                 )
-            ]
+            ],
+            # A Hive table used by dashboard 4
+            "SELECT \\* FROM objects_snapshot WHERE object_type = 'Table'": [
+                Row(
+                    workspace_id=123456789,
+                    job_run_id=1,
+                    object_type="Table",
+                    object_id=["hive_metastore", "schema", "table"],
+                    data={
+                        "catalog": "hive_metastore",
+                        "database": "schema",
+                        "name": "table",
+                        "object_type": "TABLE",
+                        "table_format": "DELTA",
+                    },
+                    failures=["Used by TABLE: hive_metastore.schema.table"],
+                    owner="cor",
+                    ucx_version=ucx_version,
+                )
+            ],
         }
     )
     ownership = create_autospec(Ownership)
     ownership.owner_of.return_value = "cor"
+    # Expect a direct filesystem failure message for dashboard 3
     dfsa = DirectFsAccess(
         source_id="/path/to/write_dfsa.py",
         source_lineage=[
             LineageAtom(
                 object_type="DASHBOARD",
-                object_id="did3",  # Not a match with dashboard below, hence no failure
+                object_id="did3",
                 other={"parent": "parent", "name": "dashboard"},
             ),
             LineageAtom(object_type="QUERY", object_id="did3/qid1", other={"name": "query"}),
         ],
         path="dfsa:/path/to/data/",
         is_read=False,
         is_write=True,
     )
     direct_fs_access_crawler = create_autospec(DirectFsAccessCrawler)
     direct_fs_access_crawler.snapshot.return_value = [dfsa]
+    # Expect a used Hive table failure message for dashboard 4
+    used_table = UsedTable(
+        catalog_name="hive_metastore",
+        schema_name="schema",
+        table_name="table",
+        source_id="did4/qid1",
+        source_lineage=[
+            LineageAtom(
+                object_type="DASHBOARD",
+                object_id="did4",
+                other={"parent": "parent", "name": "dashboard"},
+            ),
+            LineageAtom(object_type="QUERY", object_id="did4/qid1", other={"name": "query"}),
+        ],
+    )
     used_tables_crawler = create_autospec(UsedTablesCrawler)
+    used_tables_crawler.snapshot.return_value = [used_table]
     encoder = DashboardProgressEncoder(
         mock_backend,
         ownership,
         [direct_fs_access_crawler],
         [used_tables_crawler],
         inventory_database="inventory",
         job_run_id=1,
         workspace_id=123456789,
         catalog="test",
     )
-    dashboard = Dashboard("did1")
+    dashboard = Dashboard(expected.object_id[0])
 
     encoder.append_inventory_snapshot([dashboard])
 
     rows = mock_backend.rows_written_for(escape_sql_identifier(encoder.full_name), "append")
-    assert rows == expected
+    assert rows == [expected]
     ownership.owner_of.assert_called_once()
     direct_fs_access_crawler.snapshot.assert_called_once()
     used_tables_crawler.snapshot.assert_called_once()
From 3ac499a2037f107e3e91d33748e3648dfd7e2e62 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 11 Dec 2024 13:52:15 +0100
Subject: [PATCH 013/129] Test Table.from_historical

---
 tests/unit/hive_metastore/test_tables.py | 61 ++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/tests/unit/hive_metastore/test_tables.py b/tests/unit/hive_metastore/test_tables.py
index 4d153be113..c502157a2d 100644
--- a/tests/unit/hive_metastore/test_tables.py
+++ b/tests/unit/hive_metastore/test_tables.py
@@ -27,6 +27,67 @@ from databricks.labs.ucx.workspace_access.groups import GroupManager
 
 
+@pytest.mark.parametrize(
+    "data, expected",
+    [
+        (
+            {
+                "catalog": "catalog",
+                "database": "database",
+                "name": "name",
+                "object_type": "TABLE",
+                "table_format": "DELTA",
+            },
+            Table("catalog", "database", "name", "TABLE", "DELTA"),
+        ),
+        (
+            {
+                "catalog": "catalog",
+                "database": "database",
+                "name": "name",
+                "object_type": "TABLE",
+                "table_format": "DELTA",
+                "location": "dbfs://folder",
+                "upgraded_to": "catalog.schema.name",
+                "storage_properties": "property",
+                "is_partitioned": "false",
+            },
+            Table(
+                "catalog",
+                "database",
+                "name",
+                "TABLE",
+                "DELTA",
+                "dbfs://folder",
+                upgraded_to="catalog.schema.name",
+                storage_properties="property",
+                is_partitioned=False,
+            ),
+        ),
+        (
+            {
+                "catalog": "catalog",
+                "database": "database",
+                "name": "name",
+                "object_type": "VIEW",
+                "table_format": "UNKNOWN",
+                "view_text": "SELECT 1",
+            },
+            Table(
+                "catalog",
+                "database",
+                "name",
+                "VIEW",
+                "UNKNOWN",
+                view_text="SELECT 1",
+            ),
+        ),
+    ],
+)
+def test_table_from_historical_data(data: dict[str, str], expected: Table) -> None:
+    assert Table.from_historical_data(data) == expected
+
+
 def test_is_delta_true():
     delta_table = Table(catalog="catalog", database="db", name="table", object_type="type", table_format="DELTA")
     assert delta_table.is_delta
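One subtlety the "is_partitioned": "false" case in PATCH 013 brushes against: bool() over a non-empty string is always True, so coercing a stored string back to a boolean needs an explicit comparison. parse_flag below is an illustrative helper, not repo code.

    assert bool("false") is True  # truthiness, not parsing

    def parse_flag(value: str) -> bool:
        return value.strip().lower() == "true"

    assert parse_flag("false") is False
    assert parse_flag("True") is True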
"parent", "name": "dashboard"}, - ), - LineageAtom(object_type="QUERY", object_id="did3/qid1", other={"name": "query"}), - ], - path="dfsa:/path/to/data/", - is_read=False, - is_write=True, - ) - direct_fs_access_crawler = create_autospec(DirectFsAccessCrawler) - direct_fs_access_crawler.snapshot.return_value = [dfsa] - used_tables_crawler = create_autospec(UsedTablesCrawler) - encoder = DashboardProgressEncoder( - mock_backend, - ownership, - [direct_fs_access_crawler], - [used_tables_crawler], - inventory_database="inventory", - job_run_id=1, - workspace_id=123456789, - catalog="test", - ) - dashboard = Dashboard("did1") - - encoder.append_inventory_snapshot([dashboard]) - - rows = mock_backend.rows_written_for(escape_sql_identifier(encoder.full_name), "append") - assert rows == expected - ownership.owner_of.assert_called_once() - direct_fs_access_crawler.snapshot.assert_called_once() - used_tables_crawler.snapshot.assert_called_once() - - -def test_dashboard_progress_encoder_query_problem_as_failure() -> None: - expected = [ + ), Row( workspace_id=123456789, job_run_id=1, object_type="Dashboard", - object_id=["did1"], - data={"id": "did1"}, - failures=["[sql-parse-error] my_query (did1/qid1) : Could not parse SQL"], + object_id=["did2"], + data={"id": "did2"}, + failures=["[sql-parse-error] my_query (did2/qid1) : Could not parse SQL"], owner="cor", ucx_version=ucx_version, - ) - ] - - mock_backend = MockBackend( - rows={ - # The Mockbackend.fetch ignores the catalog and schema keyword arguments - "SELECT \\* FROM query_problems": [ - Row( - dashboard_id="did1", - dashboard_parent="dashbards/parent", - dashboard_name="my_dashboard", - query_id="qid1", - query_parent="queries/parent", - query_name="my_query", - code="sql-parse-error", - message="Could not parse SQL", - ) - ] - } - ) - ownership = create_autospec(Ownership) - ownership.owner_of.return_value = "cor" - direct_fs_access_crawler = create_autospec(DirectFsAccessCrawler) - used_tables_crawler = create_autospec(UsedTablesCrawler) - encoder = DashboardProgressEncoder( - mock_backend, - ownership, - [direct_fs_access_crawler], - [used_tables_crawler], - inventory_database="inventory", - job_run_id=1, - workspace_id=123456789, - catalog="test", - ) - dashboard = Dashboard("did1") - - encoder.append_inventory_snapshot([dashboard]) - - rows = mock_backend.rows_written_for(escape_sql_identifier(encoder.full_name), "append") - assert rows == expected - ownership.owner_of.assert_called_once() - direct_fs_access_crawler.snapshot.assert_called_once() - used_tables_crawler.snapshot.assert_called_once() - - -def test_dashboard_progress_encoder_direct_filesystem_access_as_failures(mock_backend) -> None: - expected = [ + ), Row( workspace_id=123456789, job_run_id=1, object_type="Dashboard", - object_id=["did1"], - data={"id": "did1"}, + object_id=["did3"], + data={"id": "did3"}, failures=[ - "[direct-filesystem-access] query (did1/qid1) : " + "[direct-filesystem-access] query (did3/qid1) : " "The use of direct filesystem references is deprecated: dfsa:/path/to/data/", ], owner="cor", ucx_version=ucx_version, - ) - ] - - ownership = create_autospec(Ownership) - ownership.owner_of.return_value = "cor" - dfsa = DirectFsAccess( - source_id="/path/to/write_dfsa.py", - source_lineage=[ - LineageAtom( - object_type="DASHBOARD", - object_id="did1", - other={"parent": "parent", "name": "dashboard"}, - ), - LineageAtom(object_type="QUERY", object_id="did1/qid1", other={"name": "query"}), - ], - path="dfsa:/path/to/data/", - is_read=False, - 
From 1ebe6a529486569f1a7f3feab6ee172edd4f390d Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 11 Dec 2024 13:54:50 +0100
Subject: [PATCH 015/129] Assert row in integration test

---
 tests/integration/progress/test_dashboards.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/tests/integration/progress/test_dashboards.py b/tests/integration/progress/test_dashboards.py
index 1964784c8e..513ea2cea1 100644
--- a/tests/integration/progress/test_dashboards.py
+++ b/tests/integration/progress/test_dashboards.py
@@ -1,8 +1,20 @@
+from databricks.labs.lsql.backends import Row
+
+from databricks.labs.ucx.__about__ import __version__ as ucx_version
 from databricks.labs.ucx.framework.utils import escape_sql_identifier
 
 
 def test_dashboard_progress_encoder_table_failures(runtime_ctx, az_cli_ctx) -> None:
-    failures = []
+    row = Row(
+        workspace_id=123456789,
+        job_run_id=1,
+        object_type="Dashboard",
+        object_id=["did1"],
+        data={"id": "did1"},
+        failures=[],
+        owner="cor",
+        ucx_version=ucx_version,
+    )
     az_cli_ctx.progress_tracking_installation.run()
     runtime_ctx = runtime_ctx.replace(
         parent_run_id=1,
@@ -16,5 +28,4 @@ def test_dashboard_progress_encoder_table_failures(runtime_ctx, az_cli_ctx) -> None:
     history_table_name = escape_sql_identifier(runtime_ctx.tables_progress.full_name)
     records = list(runtime_ctx.sql_backend.fetch(f"SELECT * FROM {history_table_name}"))
 
-    assert len(records) == 1, "Expected one historical entry"
-    assert records[0].failures == failures
+    assert records == [row]
"append") - assert rows == expected + assert rows == [expected] ownership.owner_of.assert_called_once() direct_fs_access_crawler.snapshot.assert_called_once() used_tables_crawler.snapshot.assert_called_once() From 3ac499a2037f107e3e91d33748e3648dfd7e2e62 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 11 Dec 2024 13:52:15 +0100 Subject: [PATCH 013/129] Test Table.from_historical --- tests/unit/hive_metastore/test_tables.py | 61 ++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/tests/unit/hive_metastore/test_tables.py b/tests/unit/hive_metastore/test_tables.py index 4d153be113..c502157a2d 100644 --- a/tests/unit/hive_metastore/test_tables.py +++ b/tests/unit/hive_metastore/test_tables.py @@ -27,6 +27,67 @@ from databricks.labs.ucx.workspace_access.groups import GroupManager +@pytest.mark.parametrize( + "data, expected", + [ + ( + { + "catalog": "catalog", + "database": "database", + "name": "name", + "object_type": "TABLE", + "table_format": "DELTA", + }, + Table("catalog", "database", "name", "TABLE", "DELTA"), + ), + ( + { + "catalog": "catalog", + "database": "database", + "name": "name", + "object_type": "TABLE", + "table_format": "DELTA", + "location": "dbfs://folder", + "upgraded_to": "catalog.schema.name", + "storage_properties": "property", + "is_partitioned": "false", + }, + Table( + "catalog", + "database", + "name", + "TABLE", + "DELTA", + "dbfs://folder", + upgraded_to="catalog.schema.name", + storage_properties="property", + is_partitioned=False, + ), + ), + ( + { + "catalog": "catalog", + "database": "database", + "name": "name", + "object_type": "VIEW", + "table_format": "UNKNOWN", + "view_text": "SELECT 1", + }, + Table( + "catalog", + "database", + "name", + "VIEW", + "UNKNOWN", + view_text="SELECT 1", + ), + ), + ], +) +def test_table_from_historical_data(data: dict[str, str], expected: Table) -> None: + assert Table.from_historical_data(data) == expected + + def test_is_delta_true(): delta_table = Table(catalog="catalog", database="db", name="table", object_type="type", table_format="DELTA") assert delta_table.is_delta From eba3a694e261492583d21f12453ca800b002f4c8 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 11 Dec 2024 13:52:23 +0100 Subject: [PATCH 014/129] Format --- src/databricks/labs/ucx/progress/dashboards.py | 2 +- tests/unit/progress/test_dashboards.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/progress/dashboards.py b/src/databricks/labs/ucx/progress/dashboards.py index d082f4e918..633b5e825d 100644 --- a/src/databricks/labs/ucx/progress/dashboards.py +++ b/src/databricks/labs/ucx/progress/dashboards.py @@ -117,7 +117,7 @@ def _get_used_tables(self) -> dict[str, list[UsedTable]]: def _get_tables_failures(self) -> DashboardIdToFailuresType: table_failures = {} for row in self._sql_backend.fetch( - f"SELECT * FROM objects_snapshot WHERE object_type = 'Table'", + "SELECT * FROM objects_snapshot WHERE object_type = 'Table'", catalog=self._catalog, schema=self._schema, ): diff --git a/tests/unit/progress/test_dashboards.py b/tests/unit/progress/test_dashboards.py index 00af7d9139..8bafac4e24 100644 --- a/tests/unit/progress/test_dashboards.py +++ b/tests/unit/progress/test_dashboards.py @@ -59,7 +59,7 @@ owner="cor", ucx_version=ucx_version, ), - ] + ], ) def test_dashboard_progress_encoder(expected: Row) -> None: # The Mockbackend.fetch ignores the catalog and schema keyword arguments From 1ebe6a529486569f1a7f3feab6ee172edd4f390d Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: 
From b6922cea965b2d06ada20c1e805095fb57aba7b5 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Mon, 16 Dec 2024 09:45:38 +0100
Subject: [PATCH 017/129] Add id attributes to dashboard

---
 src/databricks/labs/ucx/assessment/dashboards.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/databricks/labs/ucx/assessment/dashboards.py b/src/databricks/labs/ucx/assessment/dashboards.py
index 66a472c8e5..b429fd2961 100644
--- a/src/databricks/labs/ucx/assessment/dashboards.py
+++ b/src/databricks/labs/ucx/assessment/dashboards.py
@@ -5,6 +5,7 @@
 import logging
 from collections.abc import Iterable, Iterator
 from dataclasses import dataclass, field
+from typing import ClassVar
 
 from databricks.labs.lsql.backends import SqlBackend
 from databricks.labs.lsql.lakeview import Dashboard as LsqlLakeviewDashboard, Dataset
@@ -90,6 +91,8 @@ def from_lakeview_dataset(cls, dataset: Dataset, *, parent: str | None = None) -
 class Dashboard:
     """UCX representation of a dashboard."""
 
+    __id_attributes__: ClassVar[tuple[str, ...]] = ("id",)
+
     id: str
     """The ID for this dashboard."""
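How the __id_attributes__ marker from PATCH 017 plausibly feeds the history rows: the progress encoder reads the named attributes off each record to build object_id, which matches the object_id=["did1"] values the unit tests expect. A generic sketch of that mechanism, not the actual ProgressEncoder internals:

    from dataclasses import dataclass
    from typing import ClassVar

    @dataclass
    class Dashboard:
        __id_attributes__: ClassVar[tuple[str, ...]] = ("id",)
        id: str

    def object_id(record: Dashboard) -> list[str]:
        return [getattr(record, attr) for attr in record.__id_attributes__]

    assert object_id(Dashboard("did1")) == ["did1"]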
From 252c9781dc08826bd6307caee9e3536292144851 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Mon, 16 Dec 2024 10:13:16 +0100
Subject: [PATCH 018/129] Add dashboard ownership to GlobalContext

---
 src/databricks/labs/ucx/contexts/application.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py
index 102fcdb432..fa773f1ed3 100644
--- a/src/databricks/labs/ucx/contexts/application.py
+++ b/src/databricks/labs/ucx/contexts/application.py
@@ -12,6 +12,7 @@
 from databricks.labs.blueprint.wheels import ProductInfo, WheelsV2
 from databricks.labs.lsql.backends import SqlBackend
 
+from databricks.labs.ucx.assessment.dashboards import DashboardOwnership
 from databricks.labs.ucx.assessment.jobs import JobsCrawler
 from databricks.labs.ucx.assessment.pipelines import PipelinesCrawler
 from databricks.labs.ucx.hive_metastore.pipelines_migrate import PipelinesMigrator
@@ -266,6 +267,10 @@ def udfs_crawler(self) -> UdfsCrawler:
     def udf_ownership(self) -> UdfOwnership:
         return UdfOwnership(self.administrator_locator)
 
+    @cached_property
+    def dashboard_ownership(self) -> DashboardOwnership:
+        return DashboardOwnership(self.administrator_locator, self.workspace_client, self.workspace_path_ownership)
+
     @cached_property
     def tables_crawler(self) -> TablesCrawler:
         return TablesCrawler(self.sql_backend, self.inventory_database, self.config.include_databases)
self.sql_backend,
+            self.dashboard_ownership,
+            [self.directfs_access_crawler_for_queries],
+            [self.used_tables_crawler_for_queries],
+            self.config.inventory_database,
+            self.parent_run_id,
+            self.workspace_id,
+            self.config.ucx_catalog,
+        )
+
     @cached_property
     def migration_sequencer(self) -> MigrationSequencer:
         return MigrationSequencer(self.workspace_client, self.administrator_locator)

From 71577c0e929339117ab0c082720c5f51f3f03fbd Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Mon, 16 Dec 2024 10:14:50 +0100
Subject: [PATCH 020/129] Force keyword arguments in dashboard progress encoder

---
 src/databricks/labs/ucx/contexts/workflow_task.py | 12 ++++++------
 src/databricks/labs/ucx/progress/dashboards.py    |  1 +
 tests/unit/progress/test_dashboards.py            |  4 ++--
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/databricks/labs/ucx/contexts/workflow_task.py b/src/databricks/labs/ucx/contexts/workflow_task.py
index 58b1dfd32f..bc95eeb6eb 100644
--- a/src/databricks/labs/ucx/contexts/workflow_task.py
+++ b/src/databricks/labs/ucx/contexts/workflow_task.py
@@ -251,12 +251,12 @@ def dashboards_progress(self) -> ProgressEncoder[Dashboard]:
         return DashboardProgressEncoder(
             self.sql_backend,
             self.dashboard_ownership,
-            [self.directfs_access_crawler_for_queries],
-            [self.used_tables_crawler_for_queries],
-            self.config.inventory_database,
-            self.parent_run_id,
-            self.workspace_id,
-            self.config.ucx_catalog,
+            direct_fs_access_crawlers=[self.directfs_access_crawler_for_queries],
+            used_tables_crawlers=[self.used_tables_crawler_for_queries],
+            inventory_database=self.config.inventory_database,
+            job_run_id=self.parent_run_id,
+            workspace_id=self.workspace_id,
+            catalog=self.config.ucx_catalog,
         )

     @cached_property
diff --git a/src/databricks/labs/ucx/progress/dashboards.py b/src/databricks/labs/ucx/progress/dashboards.py
index 633b5e825d..01b9ded329 100644
--- a/src/databricks/labs/ucx/progress/dashboards.py
+++ b/src/databricks/labs/ucx/progress/dashboards.py
@@ -29,6 +29,7 @@ def __init__(
         self,
         sql_backend: SqlBackend,
         ownership: DashboardOwnership,
+        *,
         direct_fs_access_crawlers: list[DirectFsAccessCrawler],
         used_tables_crawlers: list[UsedTablesCrawler],
         inventory_database: str,
diff --git a/tests/unit/progress/test_dashboards.py b/tests/unit/progress/test_dashboards.py
index f96fd680e7..24f3e152c2 100644
--- a/tests/unit/progress/test_dashboards.py
+++ b/tests/unit/progress/test_dashboards.py
@@ -138,8 +138,8 @@ def test_dashboard_progress_encoder(expected: Row) -> None:
     encoder = DashboardProgressEncoder(
         mock_backend,
         ownership,
-        [direct_fs_access_crawler],
-        [used_tables_crawler],
+        direct_fs_access_crawlers=[direct_fs_access_crawler],
+        used_tables_crawlers=[used_tables_crawler],
         inventory_database="inventory",
         job_run_id=1,
         workspace_id=123456789,

From fc63ef24f25ac0b3828869b577efeea71eaf7716 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Mon, 16 Dec 2024 10:24:11 +0100
Subject: [PATCH 021/129] Update dashboard progress encoder integration test

---
 tests/integration/progress/test_dashboards.py | 25 ++++++-------------
 1 file changed, 8 insertions(+), 17 deletions(-)

diff --git a/tests/integration/progress/test_dashboards.py b/tests/integration/progress/test_dashboards.py
index 513ea2cea1..6c8054f254 100644
--- a/tests/integration/progress/test_dashboards.py
+++ b/tests/integration/progress/test_dashboards.py
@@ -1,31 +1,22 @@
-from
databricks.labs.ucx.assessment.dashboards import Dashboard from databricks.labs.ucx.framework.utils import escape_sql_identifier -def test_dashboard_progress_encoder_table_failures(runtime_ctx, az_cli_ctx) -> None: - row = Row( - workspace_id=123456789, - job_run_id=1, - object_type="Dashboard", - object_id=["did1"], - data={"id": "did1"}, - failures=[], - owner="cor", - ucx_version=ucx_version, - ) +def test_dashboard_progress_encoder_table_no_failures(runtime_ctx, az_cli_ctx) -> None: az_cli_ctx.progress_tracking_installation.run() runtime_ctx = runtime_ctx.replace( parent_run_id=1, sql_backend=az_cli_ctx.sql_backend, ucx_catalog=az_cli_ctx.ucx_catalog, ) - dashboard = az_cli_ctx.make_dashboard() + sdk_dashboard = runtime_ctx.make_dashboard() + dashboard = Dashboard.from_sdk_redash_dashboard(sdk_dashboard) + runtime_ctx.query_linter.refresh_report() - runtime_ctx.tables_progress.append_inventory_snapshot([dashboard]) + runtime_ctx.dashboards_progress.append_inventory_snapshot([dashboard]) history_table_name = escape_sql_identifier(runtime_ctx.tables_progress.full_name) records = list(runtime_ctx.sql_backend.fetch(f"SELECT * FROM {history_table_name}")) - assert records == [row] + assert len(records) == 1, "Expected one historical record" + assert records[0].failures == [] From 30c19ccf9afdc034f34fa8269145352c15a549eb Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Mon, 16 Dec 2024 10:58:14 +0100 Subject: [PATCH 022/129] Expect DFSA message to come from query problem --- .../labs/ucx/contexts/workflow_task.py | 1 - .../labs/ucx/progress/dashboards.py | 31 +---------------- tests/integration/progress/test_dashboards.py | 26 ++++++++++++-- tests/unit/progress/test_dashboards.py | 34 +++++++------------ 4 files changed, 36 insertions(+), 56 deletions(-) diff --git a/src/databricks/labs/ucx/contexts/workflow_task.py b/src/databricks/labs/ucx/contexts/workflow_task.py index bc95eeb6eb..68309f0312 100644 --- a/src/databricks/labs/ucx/contexts/workflow_task.py +++ b/src/databricks/labs/ucx/contexts/workflow_task.py @@ -251,7 +251,6 @@ def dashboards_progress(self) -> ProgressEncoder[Dashboard]: return DashboardProgressEncoder( self.sql_backend, self.dashboard_ownership, - direct_fs_access_crawlers=[self.directfs_access_crawler_for_queries], used_tables_crawlers=[self.used_tables_crawler_for_queries], inventory_database=self.config.inventory_database, job_run_id=self.parent_run_id, diff --git a/src/databricks/labs/ucx/progress/dashboards.py b/src/databricks/labs/ucx/progress/dashboards.py index 01b9ded329..90816bfe37 100644 --- a/src/databricks/labs/ucx/progress/dashboards.py +++ b/src/databricks/labs/ucx/progress/dashboards.py @@ -11,7 +11,6 @@ from databricks.labs.ucx.progress.history import ProgressEncoder from databricks.labs.ucx.progress.install import Historical from databricks.labs.ucx.source_code.base import UsedTable -from databricks.labs.ucx.source_code.directfs_access import DirectFsAccessCrawler from databricks.labs.ucx.source_code.queries import QueryProblem from databricks.labs.ucx.source_code.used_table import UsedTablesCrawler @@ -30,7 +29,6 @@ def __init__( sql_backend: SqlBackend, ownership: DashboardOwnership, *, - direct_fs_access_crawlers: list[DirectFsAccessCrawler], used_tables_crawlers: list[UsedTablesCrawler], inventory_database: str, job_run_id: int, @@ -48,16 +46,14 @@ def __init__( "historical", ) self._inventory_database = inventory_database - self._direct_fs_access_crawlers = direct_fs_access_crawlers self._used_tables_crawlers = used_tables_crawlers def 
append_inventory_snapshot(self, snapshot: Iterable[Dashboard]) -> None: query_problems = self._get_query_problems() - dfsas = self._get_direct_filesystem_accesses() table_failures = self._get_tables_failures() history_records = [] for record in snapshot: - history_record = self._encode_dashboard_as_historical(record, query_problems, dfsas, table_failures) + history_record = self._encode_dashboard_as_historical(record, query_problems, table_failures) history_records.append(history_record) logger.debug(f"Appending {len(history_records)} {self._klass} table record(s) to history.") # The mode is 'append'. This is documented as conflict-free. @@ -77,29 +73,6 @@ def _get_query_problems(self) -> DashboardIdToFailuresType: index[problem.dashboard_id].append(failure) return index - def _get_direct_filesystem_accesses(self) -> DashboardIdToFailuresType: - index = collections.defaultdict(list) - for crawler in self._direct_fs_access_crawlers: - for direct_fs_access in crawler.snapshot(): - # The dashboard and query source lineage are added by the QueryLinter - if len(direct_fs_access.source_lineage) < 2: - continue - if direct_fs_access.source_lineage[0].object_type != "DASHBOARD": # Note: this skips dangling queries - continue - if direct_fs_access.source_lineage[1].object_type != "QUERY": - continue - dashboard_id = direct_fs_access.source_lineage[0].object_id - query_id = direct_fs_access.source_lineage[1].object_id # / - query_name = "UNKNOWN" - if direct_fs_access.source_lineage[1].other and "name" in direct_fs_access.source_lineage[1].other: - query_name = direct_fs_access.source_lineage[1].other["name"] - # Follow same failure message structure as the QueryProblem above and DirectFsAccessPyLinter deprecation - code = "direct-filesystem-access" - message = f"The use of direct filesystem references is deprecated: {direct_fs_access.path}" - failure = f"[{code}] {query_name} ({query_id}) : {message}" - index[dashboard_id].append(failure) - return index - def _get_used_tables(self) -> dict[str, list[UsedTable]]: index = collections.defaultdict(list) for crawler in self._used_tables_crawlers: @@ -136,7 +109,6 @@ def _encode_dashboard_as_historical( self, record: Dashboard, query_problems: DashboardIdToFailuresType, - dfsas: DashboardIdToFailuresType, tables_failures: DashboardIdToFailuresType, ) -> Historical: """Encode a dashboard as a historical records. 
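
For context, a minimal sketch of the join the encoder now performs, assuming `used_tables` maps a dashboard ID to the `UsedTable` records of its queries and `table_failures` maps a table's full name to its recorded failure strings; the function name is illustrative and not part of this patch:

import collections

def link_failures_to_dashboards(used_tables: dict, table_failures: dict) -> dict:
    # Fan each table's failures out to every dashboard whose queries use it.
    index = collections.defaultdict(list)
    for dashboard_id, tables in used_tables.items():
        for table in tables:
            index[dashboard_id].extend(table_failures.get(table.full_name, []))
    return index
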
@@ -149,6 +121,5 @@ def _encode_dashboard_as_historical( historical = super()._encode_record_as_historical(record) failures = [] failures.extend(query_problems.get(record.id, [])) - failures.extend(dfsas.get(record.id, [])) failures.extend(tables_failures.get(record.id, [])) return replace(historical, failures=historical.failures + failures) diff --git a/tests/integration/progress/test_dashboards.py b/tests/integration/progress/test_dashboards.py index 6c8054f254..095575b8e9 100644 --- a/tests/integration/progress/test_dashboards.py +++ b/tests/integration/progress/test_dashboards.py @@ -1,15 +1,30 @@ +import pytest + from databricks.labs.ucx.assessment.dashboards import Dashboard from databricks.labs.ucx.framework.utils import escape_sql_identifier -def test_dashboard_progress_encoder_table_no_failures(runtime_ctx, az_cli_ctx) -> None: +@pytest.mark.parametrize( + "query, failures", + [ + ("SELECT 1", []), + ( + "SELECT * from csv.`dbfs://some_folder/some_file.csv`", + [ + "[direct-filesystem-access-in-sql-query] {query_name} ({dashboard_id}/{query_id}) : The use of direct filesystem references is deprecated: dbfs://some_folder/some_file.csv" + ], + ), + ], +) +def test_dashboard_progress_encoder_table_failures(runtime_ctx, az_cli_ctx, query: str, failures: list[str]) -> None: az_cli_ctx.progress_tracking_installation.run() runtime_ctx = runtime_ctx.replace( parent_run_id=1, sql_backend=az_cli_ctx.sql_backend, ucx_catalog=az_cli_ctx.ucx_catalog, ) - sdk_dashboard = runtime_ctx.make_dashboard() + legacy_query = runtime_ctx.make_query(sql_query=query) + sdk_dashboard = runtime_ctx.make_dashboard(query=legacy_query) dashboard = Dashboard.from_sdk_redash_dashboard(sdk_dashboard) runtime_ctx.query_linter.refresh_report() @@ -19,4 +34,9 @@ def test_dashboard_progress_encoder_table_no_failures(runtime_ctx, az_cli_ctx) - records = list(runtime_ctx.sql_backend.fetch(f"SELECT * FROM {history_table_name}")) assert len(records) == 1, "Expected one historical record" - assert records[0].failures == [] + + expected = [] + for failure in failures: + message = failure.format(query_id=legacy_query.id, query_name=legacy_query.name, dashboard_id=dashboard.id) + expected.append(message) + assert records[0].failures == expected diff --git a/tests/unit/progress/test_dashboards.py b/tests/unit/progress/test_dashboards.py index 24f3e152c2..5c17001afc 100644 --- a/tests/unit/progress/test_dashboards.py +++ b/tests/unit/progress/test_dashboards.py @@ -9,7 +9,6 @@ from databricks.labs.ucx.framework.utils import escape_sql_identifier from databricks.labs.ucx.progress.dashboards import DashboardProgressEncoder from databricks.labs.ucx.source_code.base import LineageAtom -from databricks.labs.ucx.source_code.directfs_access import DirectFsAccess, DirectFsAccessCrawler from databricks.labs.ucx.source_code.used_table import UsedTable, UsedTablesCrawler @@ -43,7 +42,7 @@ object_id=["did3"], data={"id": "did3", "query_ids": "[]", "tags": "[]"}, failures=[ - "[direct-filesystem-access] query (did3/qid1) : " + "[direct-filesystem-access-in-query] my_query (did3/qid1) : " "The use of direct filesystem references is deprecated: dfsa:/path/to/data/", ], owner="cor", @@ -76,7 +75,17 @@ def test_dashboard_progress_encoder(expected: Row) -> None: query_name="my_query", code="sql-parse-error", message="Could not parse SQL", - ) + ), + Row( + dashboard_id="did3", + dashboard_parent="dashbards/parent", + dashboard_name="my_dashboard", + query_id="qid1", + query_parent="queries/parent", + query_name="my_query", + 
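            # Mirrors what the QueryLinter now records for a direct filesystem
            # access, so no separate DirectFsAccess crawler lookup is needed.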
code="direct-filesystem-access-in-query", + message="The use of direct filesystem references is deprecated: dfsa:/path/to/data/", + ), ], # A Hive table used by dashboard 4 "SELECT \\* FROM objects_snapshot WHERE object_type = 'Table'": [ @@ -101,23 +110,6 @@ def test_dashboard_progress_encoder(expected: Row) -> None: ) ownership = create_autospec(Ownership) ownership.owner_of.return_value = "cor" - # Expect a direct filesystem failure message for dashboard 3 - dfsa = DirectFsAccess( - source_id="/path/to/write_dfsa.py", - source_lineage=[ - LineageAtom( - object_type="DASHBOARD", - object_id="did3", - other={"parent": "parent", "name": "dashboard"}, - ), - LineageAtom(object_type="QUERY", object_id="did3/qid1", other={"name": "query"}), - ], - path="dfsa:/path/to/data/", - is_read=False, - is_write=True, - ) - direct_fs_access_crawler = create_autospec(DirectFsAccessCrawler) - direct_fs_access_crawler.snapshot.return_value = [dfsa] # Expect a used Hive table failure message for dashboard 4 used_table = UsedTable( catalog_name="hive_metastore", @@ -138,7 +130,6 @@ def test_dashboard_progress_encoder(expected: Row) -> None: encoder = DashboardProgressEncoder( mock_backend, ownership, - direct_fs_access_crawlers=[direct_fs_access_crawler], used_tables_crawlers=[used_tables_crawler], inventory_database="inventory", job_run_id=1, @@ -152,5 +143,4 @@ def test_dashboard_progress_encoder(expected: Row) -> None: rows = mock_backend.rows_written_for(escape_sql_identifier(encoder.full_name), "append") assert rows == [expected] ownership.owner_of.assert_called_once() - direct_fs_access_crawler.snapshot.assert_called_once() used_tables_crawler.snapshot.assert_called_once() From d2f0e769f3245f3e61a23adc204f0857a99b2bc3 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Mon, 16 Dec 2024 12:00:06 +0100 Subject: [PATCH 023/129] Add from table info to Table --- .../labs/ucx/hive_metastore/tables.py | 31 ++++++++++++++++--- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/src/databricks/labs/ucx/hive_metastore/tables.py b/src/databricks/labs/ucx/hive_metastore/tables.py index 5b40f0a284..22e7c7307e 100644 --- a/src/databricks/labs/ucx/hive_metastore/tables.py +++ b/src/databricks/labs/ucx/hive_metastore/tables.py @@ -9,13 +9,13 @@ from functools import cached_property, partial import sqlglot -from sqlglot import expressions -from sqlglot.expressions import LocationProperty -from sqlglot.errors import ParseError - from databricks.labs.blueprint.parallel import Threads from databricks.labs.lsql.backends import SqlBackend from databricks.sdk.errors import NotFound +from databricks.sdk.service.catalog import TableInfo +from sqlglot import expressions +from sqlglot.expressions import LocationProperty +from sqlglot.errors import ParseError from databricks.labs.ucx.framework.crawlers import CrawlerBase from databricks.labs.ucx.framework.utils import escape_sql_identifier @@ -88,6 +88,29 @@ def __post_init__(self) -> None: if isinstance(self.table_format, str): # Should not happen according to type hint, still safer self.table_format = self.table_format.upper() + @classmethod + def from_table_info(cls, table: TableInfo) -> Table: + if table.catalog_name is None or table.schema_name is None or table.name is None: + raise ValueError(f"Catalog, schema and table name are missing: {table}") + if table.table_type is None: + raise ValueError(f"Table type is missing: {table.table_type}") + if table.data_source_format is None: + raise ValueError(f"Data source format is missing: {table.data_source_format}") + 
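        # Past this point the mandatory TableInfo fields are guaranteed to be
        # set; collect them first, then add the optional fields when present.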
kwargs: dict[str, str | bool] = {
+            "catalog": table.catalog_name,
+            "database": table.schema_name,
+            "name": table.name,
+            "object_type": table.table_type.value,
+            "table_format": table.data_source_format.value,
+        }
+        if table.storage_location:
+            kwargs["location"] = table.storage_location
+        if table.view_definition:
+            kwargs["view_text"] = table.view_definition
+        if table.properties and "upgraded_to" in table.properties:
+            kwargs["upgraded_to"] = table.properties["upgraded_to"]
+        return cls(**kwargs)  # type: ignore
+
     @classmethod
     def from_historical_data(cls, data: dict[str, str]) -> Table:
         kwargs: dict[str, str | bool] = {

From d52877e1f393f6218123ea70f76b18e94ddf7f90 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Mon, 16 Dec 2024 13:34:12 +0100
Subject: [PATCH 024/129] Isort

---
 src/databricks/labs/ucx/hive_metastore/tables.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/databricks/labs/ucx/hive_metastore/tables.py b/src/databricks/labs/ucx/hive_metastore/tables.py
index 22e7c7307e..83d254d0e0 100644
--- a/src/databricks/labs/ucx/hive_metastore/tables.py
+++ b/src/databricks/labs/ucx/hive_metastore/tables.py
@@ -9,13 +9,14 @@ from functools import cached_property, partial

 import sqlglot
+from sqlglot import expressions
+from sqlglot.expressions import LocationProperty
+from sqlglot.errors import ParseError
+
 from databricks.labs.blueprint.parallel import Threads
 from databricks.labs.lsql.backends import SqlBackend
 from databricks.sdk.errors import NotFound
 from databricks.sdk.service.catalog import TableInfo
-from sqlglot import expressions
-from sqlglot.expressions import LocationProperty
-from sqlglot.errors import ParseError

 from databricks.labs.ucx.framework.crawlers import CrawlerBase
 from databricks.labs.ucx.framework.utils import escape_sql_identifier

From d983e36357156950b66054c72c6526d867d2b191 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Mon, 16 Dec 2024 13:34:48 +0100
Subject: [PATCH 025/129] Remove used tables crawler from TableProgressEncoder

The dependency will run the other way around: code references will point
to the tables they use.
---
 .../labs/ucx/contexts/workflow_task.py     |  1 -
 src/databricks/labs/ucx/progress/tables.py | 22 +----------
 tests/integration/progress/test_tables.py  | 22 +----------
 tests/unit/progress/test_tables.py         | 37 ++-----------------
 4 files changed, 6 insertions(+), 76 deletions(-)

diff --git a/src/databricks/labs/ucx/contexts/workflow_task.py b/src/databricks/labs/ucx/contexts/workflow_task.py
index 68309f0312..298d3b0ffa 100644
--- a/src/databricks/labs/ucx/contexts/workflow_task.py
+++ b/src/databricks/labs/ucx/contexts/workflow_task.py
@@ -229,7 +229,6 @@ def tables_progress(self) -> ProgressEncoder[Table]:
             self.sql_backend,
             self.table_ownership,
             self.migration_status_refresher,
-            [self.used_tables_crawler_for_paths, self.used_tables_crawler_for_queries],
             self.parent_run_id,
             self.workspace_id,
             self.config.ucx_catalog,
diff --git a/src/databricks/labs/ucx/progress/tables.py b/src/databricks/labs/ucx/progress/tables.py
index d1d6eab1a9..4aa52bac0c 100644
--- a/src/databricks/labs/ucx/progress/tables.py
+++ b/src/databricks/labs/ucx/progress/tables.py
@@ -1,5 +1,4 @@
 import logging
-from collections import defaultdict
 from collections.abc import Iterable
 from dataclasses import replace

@@ -12,8 +11,6 @@
 from databricks.labs.ucx.hive_metastore.ownership import TableOwnership
 from databricks.labs.ucx.progress.history import ProgressEncoder
 from databricks.labs.ucx.progress.install import Historical
-from
databricks.labs.ucx.source_code.base import UsedTable -from databricks.labs.ucx.source_code.used_table import UsedTablesCrawler logger = logging.getLogger(__name__) @@ -27,7 +24,6 @@ def __init__( sql_backend: SqlBackend, ownership: TableOwnership, migration_status_refresher: CrawlerBase[TableMigrationStatus], - used_tables_crawlers: list[UsedTablesCrawler], run_id: int, workspace_id: int, catalog: str, @@ -43,30 +39,18 @@ def __init__( "historical", ) self._migration_status_refresher = migration_status_refresher - self._used_tables_crawlers = used_tables_crawlers def append_inventory_snapshot(self, snapshot: Iterable[Table]) -> None: migration_index = TableMigrationIndex(self._migration_status_refresher.snapshot()) - used_hive_tables = self._get_used_hive_tables() history_records = [] for record in snapshot: - history_record = self._encode_table_as_historical(record, migration_index, used_hive_tables) + history_record = self._encode_table_as_historical(record, migration_index) history_records.append(history_record) logger.debug(f"Appending {len(history_records)} {self._klass} table record(s) to history.") # The mode is 'append'. This is documented as conflict-free. self._sql_backend.save_table(escape_sql_identifier(self.full_name), history_records, Historical, mode="append") - def _get_used_hive_tables(self) -> dict[str, list[UsedTable]]: - used_tables: dict[str, list[UsedTable]] = defaultdict(list[UsedTable]) - for crawler in self._used_tables_crawlers: - for used_table in crawler.snapshot(): - if used_table.catalog_name == "hive_metastore": - used_tables[used_table.full_name].append(used_table) - return used_tables - - def _encode_table_as_historical( - self, record: Table, migration_index: TableMigrationIndex, used_hive_tables: dict[str, list[UsedTable]] - ) -> Historical: + def _encode_table_as_historical(self, record: Table, migration_index: TableMigrationIndex) -> Historical: """Encode a table record, enriching with the migration status and used table references. 
Possible failures, the table is @@ -81,6 +65,4 @@ def _encode_table_as_historical( failures = [] if not migration_index.is_migrated(record.database, record.name): failures.append("Pending migration") - for used_table in used_hive_tables.get(record.full_name, []): - failures.append(f"Used by {used_table.source_type}: {used_table.source_id}") return replace(historical, failures=historical.failures + failures) diff --git a/tests/integration/progress/test_tables.py b/tests/integration/progress/test_tables.py index 69cd412c09..da94528724 100644 --- a/tests/integration/progress/test_tables.py +++ b/tests/integration/progress/test_tables.py @@ -1,26 +1,17 @@ -import datetime as dt - import pytest from databricks.labs.ucx.framework.utils import escape_sql_identifier from databricks.labs.ucx.hive_metastore.tables import Table -from databricks.labs.ucx.source_code.base import LineageAtom, UsedTable @pytest.mark.parametrize("is_migrated_table", [True, False]) -@pytest.mark.parametrize("is_used_table", [True, False]) def test_table_progress_encoder_table_failures( runtime_ctx, az_cli_ctx, make_catalog, is_migrated_table: bool, - is_used_table: bool, ) -> None: - failures = [] - if not is_migrated_table: - failures.append("Pending migration") - if is_used_table: - failures.append("Used by NOTEBOOK: test/test.py") + failures = [] if is_migrated_table else ["Pending migration"] az_cli_ctx.progress_tracking_installation.run() runtime_ctx = runtime_ctx.replace( @@ -34,17 +25,6 @@ def test_table_progress_encoder_table_failures( hive_table_info = runtime_ctx.make_table(tbl_properties=hive_tbl_properties) uc_tbl_properties = {"upgraded_from": hive_table_info.full_name} if is_migrated_table else {} runtime_ctx.make_table(catalog_name=make_catalog().name, tbl_properties=uc_tbl_properties) - hive_used_table = UsedTable( - catalog_name="hive_metastore" if is_used_table else "catalog", - schema_name=hive_table_info.schema_name, - table_name=hive_table_info.name, - source_id="test/test.py", - source_timestamp=dt.datetime.now(tz=dt.timezone.utc), - source_lineage=[LineageAtom(object_type="NOTEBOOK", object_id="test/test.py")], - assessment_start_timestamp=dt.datetime.now(tz=dt.timezone.utc), - assessment_end_timestamp=dt.datetime.now(tz=dt.timezone.utc), - ) - runtime_ctx.used_tables_crawler_for_paths.dump_all([hive_used_table]) hive_table = Table( hive_table_info.catalog_name, diff --git a/tests/unit/progress/test_tables.py b/tests/unit/progress/test_tables.py index ef6447cdb5..770b7ae820 100644 --- a/tests/unit/progress/test_tables.py +++ b/tests/unit/progress/test_tables.py @@ -1,4 +1,3 @@ -import datetime as dt from unittest.mock import create_autospec from databricks.labs.ucx.framework.owners import Ownership @@ -9,8 +8,6 @@ ) from databricks.labs.ucx.hive_metastore.tables import Table from databricks.labs.ucx.progress.tables import TableProgressEncoder -from databricks.labs.ucx.source_code.base import LineageAtom, UsedTable -from databricks.labs.ucx.source_code.used_table import UsedTablesCrawler def test_table_progress_encoder_no_failures(mock_backend) -> None: @@ -21,16 +18,8 @@ def test_table_progress_encoder_no_failures(mock_backend) -> None: migration_status_crawler.snapshot.return_value = ( TableMigrationStatus(table.database, table.name, "main", "default", table.name, update_ts=None), ) - used_tables_crawler = create_autospec(UsedTablesCrawler) - used_tables_crawler.snapshot.return_value = [] encoder = TableProgressEncoder( - mock_backend, - ownership, - migration_status_crawler, - 
[used_tables_crawler], - run_id=1, - workspace_id=123456789, - catalog="test", + mock_backend, ownership, migration_status_crawler, run_id=1, workspace_id=123456789, catalog="test" ) encoder.append_inventory_snapshot([table]) @@ -40,7 +29,6 @@ def test_table_progress_encoder_no_failures(mock_backend) -> None: assert len(rows[0].failures) == 0 ownership.owner_of.assert_called_once() migration_status_crawler.snapshot.assert_called_once() - used_tables_crawler.snapshot.assert_called_once() def test_table_progress_encoder_pending_migration_failure(mock_backend) -> None: @@ -51,33 +39,14 @@ def test_table_progress_encoder_pending_migration_failure(mock_backend) -> None: migration_status_crawler.snapshot.return_value = ( TableMigrationStatus(table.database, table.name), # No destination: therefore not yet migrated. ) - used_tables_crawler_for_paths = create_autospec(UsedTablesCrawler) - used_table = UsedTable( - catalog_name=table.catalog, - schema_name=table.database, - table_name=table.name, - source_id="test/test.py", - source_timestamp=dt.datetime.now(tz=dt.timezone.utc), - source_lineage=[LineageAtom(object_type="NOTEBOOK", object_id="test/test.py")], - assessment_start_timestamp=dt.datetime.now(tz=dt.timezone.utc), - assessment_end_timestamp=dt.datetime.now(tz=dt.timezone.utc), - ) - used_tables_crawler_for_paths.snapshot.return_value = [used_table] encoder = TableProgressEncoder( - mock_backend, - ownership, - migration_status_crawler, - [used_tables_crawler_for_paths], - run_id=1, - workspace_id=123456789, - catalog="test", + mock_backend, ownership, migration_status_crawler, run_id=1, workspace_id=123456789, catalog="test" ) encoder.append_inventory_snapshot([table]) rows = mock_backend.rows_written_for(escape_sql_identifier(encoder.full_name), "append") assert len(rows) > 0, f"No rows written for: {encoder.full_name}" - assert rows[0].failures == ["Pending migration", "Used by NOTEBOOK: test/test.py"] + assert rows[0].failures == ["Pending migration"] ownership.owner_of.assert_called_once() migration_status_crawler.snapshot.assert_called_once() - used_tables_crawler_for_paths.snapshot.assert_called_once() From a38ec1d3dab087b26cceb785b9b00ada6a00772b Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Mon, 16 Dec 2024 13:47:30 +0100 Subject: [PATCH 026/129] Pass failure from used table to dashboard --- .../labs/ucx/progress/dashboards.py | 3 +- tests/integration/progress/test_dashboards.py | 33 +++++++++++++++---- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/src/databricks/labs/ucx/progress/dashboards.py b/src/databricks/labs/ucx/progress/dashboards.py index 90816bfe37..1f6d5d14ec 100644 --- a/src/databricks/labs/ucx/progress/dashboards.py +++ b/src/databricks/labs/ucx/progress/dashboards.py @@ -102,7 +102,8 @@ def _get_tables_failures(self) -> DashboardIdToFailuresType: used_tables = self._get_used_tables() for dashboard_id, used_tables_in_dashboard in used_tables.items(): for used_table in used_tables_in_dashboard: - index[dashboard_id].extend(table_failures.get(used_table.full_name, [])) + for failure in table_failures.get(used_table.full_name, []): + index[dashboard_id].append(f"{failure}: {used_table.full_name}") return index def _encode_dashboard_as_historical( diff --git a/tests/integration/progress/test_dashboards.py b/tests/integration/progress/test_dashboards.py index 095575b8e9..aca3b5c327 100644 --- a/tests/integration/progress/test_dashboards.py +++ b/tests/integration/progress/test_dashboards.py @@ -1,7 +1,9 @@ +import datetime as dt import pytest from 
databricks.labs.ucx.assessment.dashboards import Dashboard from databricks.labs.ucx.framework.utils import escape_sql_identifier +from databricks.labs.ucx.hive_metastore.tables import Table @pytest.mark.parametrize( @@ -9,34 +11,53 @@ [ ("SELECT 1", []), ( - "SELECT * from csv.`dbfs://some_folder/some_file.csv`", + "SELECT * FROM csv.`dbfs://some_folder/some_file.csv`", [ "[direct-filesystem-access-in-sql-query] {query_name} ({dashboard_id}/{query_id}) : The use of direct filesystem references is deprecated: dbfs://some_folder/some_file.csv" ], ), + ( + "SELECT * FROM {table_name}", + ["Pending migration: {table_name}"], + ), ], ) def test_dashboard_progress_encoder_table_failures(runtime_ctx, az_cli_ctx, query: str, failures: list[str]) -> None: az_cli_ctx.progress_tracking_installation.run() runtime_ctx = runtime_ctx.replace( parent_run_id=1, + named_parameters={ + "workflow": "migration-progress-experimental", + "job_id": "2", + "start_time": dt.datetime.now(tz=dt.timezone.utc).replace(microsecond=0).isoformat(), + }, sql_backend=az_cli_ctx.sql_backend, ucx_catalog=az_cli_ctx.ucx_catalog, ) - legacy_query = runtime_ctx.make_query(sql_query=query) + table_info = runtime_ctx.make_table() + legacy_query = runtime_ctx.make_query(sql_query=query.format(table_name=table_info.full_name)) sdk_dashboard = runtime_ctx.make_dashboard(query=legacy_query) dashboard = Dashboard.from_sdk_redash_dashboard(sdk_dashboard) + + # Below is a minimal subset of the migration progress workflow tasks runtime_ctx.query_linter.refresh_report() + runtime_ctx.tables_progress.append_inventory_snapshot([Table.from_table_info(table_info)]) + runtime_ctx.workflow_run_recorder.record() runtime_ctx.dashboards_progress.append_inventory_snapshot([dashboard]) history_table_name = escape_sql_identifier(runtime_ctx.tables_progress.full_name) - records = list(runtime_ctx.sql_backend.fetch(f"SELECT * FROM {history_table_name}")) + records = list(runtime_ctx.sql_backend.fetch(f"SELECT * FROM {history_table_name} WHERE object_type = 'Dashboard'")) - assert len(records) == 1, "Expected one historical record" + assert len(records) == 1, "Expected one historical dashboard record" expected = [] for failure in failures: - message = failure.format(query_id=legacy_query.id, query_name=legacy_query.name, dashboard_id=dashboard.id) + message = failure.format( + query_id=legacy_query.id, + query_name=legacy_query.name, + dashboard_id=dashboard.id, + table_name=table_info.full_name, + ) expected.append(message) - assert records[0].failures == expected + assert records[-1].failures == expected From 7daf8dd4470215566d67f5b1cc6ab7c2b40f8751 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Mon, 16 Dec 2024 14:17:47 +0100 Subject: [PATCH 027/129] Persist dashboard migration progress in workflow --- src/databricks/labs/ucx/progress/workflows.py | 26 ++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/src/databricks/labs/ucx/progress/workflows.py b/src/databricks/labs/ucx/progress/workflows.py index ff7ffbedfb..5323d48929 100644 --- a/src/databricks/labs/ucx/progress/workflows.py +++ b/src/databricks/labs/ucx/progress/workflows.py @@ -141,12 +141,32 @@ def crawl_cluster_policies(self, ctx: RuntimeContext) -> None: history_log.append_inventory_snapshot(cluster_policies_snapshot) @job_task(depends_on=[verify_prerequisites]) + def crawl_redash_dashboards(self, ctx: RuntimeContext): + """Scans all Redash dashboards.""" + ctx.redash_crawler.snapshot(force_refresh=True) + + @job_task(depends_on=[verify_prerequisites]) + 
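    # Runs in parallel with crawl_redash_dashboards above; assess_dashboards
    # waits for both crawls before linting the collected queries.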
def crawl_lakeview_dashboards(self, ctx: RuntimeContext): + """Scans all Lakeview dashboards.""" + ctx.lakeview_crawler.snapshot(force_refresh=True) + + @job_task(depends_on=[crawl_redash_dashboards, crawl_lakeview_dashboards]) def assess_dashboards(self, ctx: RuntimeContext): - """Scans all dashboards for migration issues in SQL code of embedded widgets. - Also stores direct filesystem accesses for display in the migration dashboard.""" - # TODO: Ensure these are captured in the history log. + """Scans all dashboards for migration issues in SQL code of embedded widgets.""" ctx.query_linter.refresh_report() + @job_task(depends_on=[assess_dashboards], job_cluster="user_isolation") + def update_redash_dashboards_history_log(self, ctx: RuntimeContext) -> None: + """Update the history log with the latest Redash dashboards inventory snapshot.""" + redash_dashboards_snapshot = ctx.redash_crawler.snapshot(force_refresh=False) + ctx.dashboards_progress.append_inventory_snapshot(redash_dashboards_snapshot) + + @job_task(depends_on=[assess_dashboards], job_cluster="user_isolation") + def update_lakeview_dashboards_history_log(self, ctx: RuntimeContext) -> None: + """Update the history log with the latest Lakeview dashboards inventory snapshot.""" + lakeview_dashboards_snapshot = ctx.lakeview_crawler.snapshot(force_refresh=False) + ctx.dashboards_progress.append_inventory_snapshot(lakeview_dashboards_snapshot) + @job_task(depends_on=[verify_prerequisites]) def assess_workflows(self, ctx: RuntimeContext): """Scans all jobs for migration issues in notebooks. From 8e4d2c70cb6560bf5b06b5dc87af0c30227102bb Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Mon, 16 Dec 2024 14:19:06 +0100 Subject: [PATCH 028/129] Force run integration test in CI --- tests/integration/progress/test_workflows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/progress/test_workflows.py b/tests/integration/progress/test_workflows.py index c9e8ee0afa..7ddc2794dc 100644 --- a/tests/integration/progress/test_workflows.py +++ b/tests/integration/progress/test_workflows.py @@ -31,7 +31,7 @@ def test_running_real_migration_progress_job(installation_ctx: MockInstallationC installation_ctx.deployed_workflows.run_workflow(workflow) assert installation_ctx.deployed_workflows.validate_step(workflow), f"Workflow failed: {workflow}" - # Ensure that the migration-progress workflow populated the `workflow_runs` table. + # Ensure that the `migration-progress` workflow populated the `workflow_runs` table. 
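    # A single matching row proves the recorder ran; LIMIT 1 keeps the probe cheap.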
query = f"SELECT 1 FROM {installation_ctx.ucx_catalog}.multiworkspace.workflow_runs LIMIT 1" assert any(installation_ctx.sql_backend.fetch(query)), f"No workflow run captured: {query}" From 684cb1e150941b67f73a59e4275044f5ef70a1e6 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 17 Dec 2024 10:35:11 +0100 Subject: [PATCH 029/129] Test Redash ownership is me --- tests/integration/assessment/test_dashboards.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/integration/assessment/test_dashboards.py b/tests/integration/assessment/test_dashboards.py index cf84afb4bf..cc711abe23 100644 --- a/tests/integration/assessment/test_dashboards.py +++ b/tests/integration/assessment/test_dashboards.py @@ -66,3 +66,13 @@ def test_lakeview_dashboard_crawler_crawls_dashboard( dashboards = list(crawler.snapshot()) assert dashboards == [Dashboard.from_sdk_lakeview_dashboard(dashboard)] + + +def test_redash_dashboard_ownership_is_me(runtime_ctx) -> None: + sdk_redash_dashboard = runtime_ctx.make_dashboard() + dashboard = Dashboard.from_sdk_redash_dashboard(sdk_redash_dashboard) + + owner = runtime_ctx.dashboard_ownership.owner_of(dashboard) + + me = runtime_ctx.workspace_client.current_user.me() + assert owner == me.display_name From b0fd8f74267b327571b44b6ee905022ec1af9788 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 17 Dec 2024 10:46:54 +0100 Subject: [PATCH 030/129] Test ownership of directory --- tests/integration/framework/test_owners.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/integration/framework/test_owners.py b/tests/integration/framework/test_owners.py index 756ddff362..f04d8a384a 100644 --- a/tests/integration/framework/test_owners.py +++ b/tests/integration/framework/test_owners.py @@ -83,3 +83,12 @@ def test_file_owner(make_workspace_file, ws): my_user = ws.current_user.me() assert name == my_user.user_name + + +def test_home_directory_owner_is_me(runtime_ctx) -> None: + me = runtime_ctx.workspace_client.current_user.me() + home = f"/Users/{me.user_name}" + + name = runtime_ctx.workspace_path_ownership.owner_of_path(home) + + assert name == me.user_name From af1fa79f66b5df4fae4493ad5502da94081e697e Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 17 Dec 2024 10:47:04 +0100 Subject: [PATCH 031/129] Support ownership of directory --- src/databricks/labs/ucx/framework/owners.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/databricks/labs/ucx/framework/owners.py b/src/databricks/labs/ucx/framework/owners.py index 55a1ddac98..d8efd6fdb9 100644 --- a/src/databricks/labs/ucx/framework/owners.py +++ b/src/databricks/labs/ucx/framework/owners.py @@ -226,6 +226,8 @@ def _maybe_type_and_id(path: WorkspacePath) -> tuple[str, str] | None: return 'notebooks', object_id case ObjectType.FILE: return 'files', object_id + case ObjectType.DIRECTORY: + return 'directories', object_id return None @staticmethod From 38d29b20b4a0687eae1ef4a94b9c3a5d6ca64f84 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 17 Dec 2024 11:01:02 +0100 Subject: [PATCH 032/129] Test owner of directory --- tests/unit/framework/test_owners.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/unit/framework/test_owners.py b/tests/unit/framework/test_owners.py index 25dd465b6f..70b741fbeb 100644 --- a/tests/unit/framework/test_owners.py +++ b/tests/unit/framework/test_owners.py @@ -3,8 +3,11 @@ from unittest.mock import create_autospec, Mock import pytest +from databricks.labs.blueprint.paths import WorkspacePath +from databricks.sdk 
import WorkspaceClient from databricks.sdk.errors import NotFound from databricks.sdk.service import iam +from databricks.sdk.service.workspace import ObjectInfo, ObjectType from databricks.labs.ucx.framework.owners import ( AccountAdministratorFinder, @@ -13,6 +16,7 @@ Ownership, Record, WorkspaceAdministratorFinder, + WorkspacePathOwnership, ) @@ -343,3 +347,19 @@ def test_ownership_no_fallback_admin_user_error() -> None: with pytest.raises(RuntimeError, match="Mocked admin lookup failure."): _ = ownership.owner_of("school") + + +def test_workspace_path_ownership_for_directory() -> None: + administrator_locator = create_autospec(AdministratorLocator) + ws = create_autospec(WorkspaceClient) + ws.workspace.get_status.return_value = ObjectInfo(object_id=1, object_type=ObjectType.DIRECTORY) + can_manage_permission = iam.Permission(permission_level=iam.PermissionLevel.CAN_MANAGE) + access_control_list = [iam.AccessControlResponse(all_permissions=[can_manage_permission], user_name="cor")] + ws.permissions.get.return_value = iam.ObjectPermissions(access_control_list=access_control_list) + ownership = WorkspacePathOwnership(administrator_locator, ws) + + owner = ownership.owner_of(WorkspacePath(ws, "/some/directory")) + + assert owner == "cor" + administrator_locator.get_workspace_administrator.assert_not_called() + ws.permissions.get.assert_called_with("directories", "1") From 51b061d681a39db365a9995e840225ce710e7cf5 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 17 Dec 2024 11:09:59 +0100 Subject: [PATCH 033/129] Test retrieving ownership for invalid path --- tests/integration/framework/test_owners.py | 5 +++++ tests/unit/framework/test_owners.py | 16 +++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/tests/integration/framework/test_owners.py b/tests/integration/framework/test_owners.py index f04d8a384a..f86d34238e 100644 --- a/tests/integration/framework/test_owners.py +++ b/tests/integration/framework/test_owners.py @@ -92,3 +92,8 @@ def test_home_directory_owner_is_me(runtime_ctx) -> None: name = runtime_ctx.workspace_path_ownership.owner_of_path(home) assert name == me.user_name + + +def test_workspace_path_owner_of_invalid_path(runtime_ctx) -> None: + owner = runtime_ctx.workspace_path_ownership.owner_of_path("invalid/path") + assert owner is None diff --git a/tests/unit/framework/test_owners.py b/tests/unit/framework/test_owners.py index 70b741fbeb..de657fccd1 100644 --- a/tests/unit/framework/test_owners.py +++ b/tests/unit/framework/test_owners.py @@ -5,7 +5,7 @@ import pytest from databricks.labs.blueprint.paths import WorkspacePath from databricks.sdk import WorkspaceClient -from databricks.sdk.errors import NotFound +from databricks.sdk.errors import InvalidParameterValue, NotFound from databricks.sdk.service import iam from databricks.sdk.service.workspace import ObjectInfo, ObjectType @@ -349,6 +349,20 @@ def test_ownership_no_fallback_admin_user_error() -> None: _ = ownership.owner_of("school") +def test_workspace_path_ownership_for_invalid_path() -> None: + administrator_locator = create_autospec(AdministratorLocator) + administrator_locator.get_workspace_administrator.return_value = "Admin" + ws = create_autospec(WorkspaceClient) + ws.workspace.get_status.side_effect = InvalidParameterValue("Invalid path") + ownership = WorkspacePathOwnership(administrator_locator, ws) + + owner = ownership.owner_of(WorkspacePath(ws, "invalid/path/misses/leading/backslash")) + + assert owner == "Admin" + 
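    # The invalid path never reaches the permissions API; ownership falls back
    # to the workspace administrator instead.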
administrator_locator.get_workspace_administrator.assert_called_once() + ws.permissions.get.assert_not_called() + + def test_workspace_path_ownership_for_directory() -> None: administrator_locator = create_autospec(AdministratorLocator) ws = create_autospec(WorkspaceClient) From c7c57a6769809c7383f9b37dcd0b8983442be8da Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 17 Dec 2024 11:10:09 +0100 Subject: [PATCH 034/129] Handle invalid path --- src/databricks/labs/ucx/framework/owners.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/framework/owners.py b/src/databricks/labs/ucx/framework/owners.py index d8efd6fdb9..a6b6f1ebc5 100644 --- a/src/databricks/labs/ucx/framework/owners.py +++ b/src/databricks/labs/ucx/framework/owners.py @@ -7,7 +7,7 @@ from databricks.labs.blueprint.paths import WorkspacePath from databricks.sdk import WorkspaceClient -from databricks.sdk.errors import NotFound, InternalError +from databricks.sdk.errors import InternalError, InvalidParameterValue, NotFound from databricks.sdk.retries import retried from databricks.sdk.service.iam import User, PermissionLevel from databricks.sdk.service.workspace import ObjectType @@ -219,7 +219,11 @@ def _maybe_direct_owner(self, record: WorkspacePath) -> str | None: @staticmethod def _maybe_type_and_id(path: WorkspacePath) -> tuple[str, str] | None: - object_info = path._object_info # pylint: disable=protected-access + try: + object_info = path._object_info # pylint: disable=protected-access + except InvalidParameterValue: + logger.warning(f"Cannot retrieve status for: {path}") + return None object_id = str(object_info.object_id) match object_info.object_type: case ObjectType.NOTEBOOK: From b91f1351ff6c0cbb73fce9b14aebfb6510d27a46 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 17 Dec 2024 11:11:01 +0100 Subject: [PATCH 035/129] Add missing type hints --- src/databricks/labs/ucx/framework/owners.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/framework/owners.py b/src/databricks/labs/ucx/framework/owners.py index a6b6f1ebc5..15cb8f2bcc 100644 --- a/src/databricks/labs/ucx/framework/owners.py +++ b/src/databricks/labs/ucx/framework/owners.py @@ -9,7 +9,7 @@ from databricks.sdk import WorkspaceClient from databricks.sdk.errors import InternalError, InvalidParameterValue, NotFound from databricks.sdk.retries import retried -from databricks.sdk.service.iam import User, PermissionLevel +from databricks.sdk.service.iam import User, ObjectPermissions, PermissionLevel from databricks.sdk.service.workspace import ObjectType logger = logging.getLogger(__name__) @@ -235,7 +235,7 @@ def _maybe_type_and_id(path: WorkspacePath) -> tuple[str, str] | None: return None @staticmethod - def _infer_from_first_can_manage(object_permissions): + def _infer_from_first_can_manage(object_permissions: ObjectPermissions) -> str: for acl in object_permissions.access_control_list: for permission in acl.all_permissions: if permission.permission_level != PermissionLevel.CAN_MANAGE: From 67a1007e3058973156ce7892dc395baa2c14135d Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 17 Dec 2024 11:17:27 +0100 Subject: [PATCH 036/129] Test warn about unsupported object type --- tests/unit/framework/test_owners.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/unit/framework/test_owners.py b/tests/unit/framework/test_owners.py index de657fccd1..e5337765ec 100644 --- a/tests/unit/framework/test_owners.py +++ 
b/tests/unit/framework/test_owners.py
@@ -1,3 +1,4 @@
+import logging
 import re
 from collections.abc import Callable, Sequence
 from unittest.mock import create_autospec, Mock
@@ -363,6 +364,22 @@ def test_workspace_path_ownership_for_invalid_path() -> None:
     ws.permissions.get.assert_not_called()


+def test_workspace_path_ownership_warns_about_unsupported_object_type(caplog) -> None:
+    administrator_locator = create_autospec(AdministratorLocator)
+    administrator_locator.get_workspace_administrator.return_value = "Admin"
+    ws = create_autospec(WorkspaceClient)
+    ws.workspace.get_status.return_value = ObjectInfo(object_id=1, object_type=ObjectType.REPO)
+    ownership = WorkspacePathOwnership(administrator_locator, ws)
+
+    with caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.framework.owners"):
+        owner = ownership.owner_of(WorkspacePath(ws, "/Workspace/Repos/repo"))
+
+    assert owner == "Admin"
+    assert "Unsupported object type: REPO" in caplog.messages
+    administrator_locator.get_workspace_administrator.assert_called_once()
+    ws.permissions.get.assert_not_called()
+
+
 def test_workspace_path_ownership_for_directory() -> None:
     administrator_locator = create_autospec(AdministratorLocator)
     ws = create_autospec(WorkspaceClient)

From 6f2b27209cdc745f50021247e54e2ca345a723e2 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Tue, 17 Dec 2024 11:17:37 +0100
Subject: [PATCH 037/129] Warn about unsupported object type

---
 src/databricks/labs/ucx/framework/owners.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/databricks/labs/ucx/framework/owners.py b/src/databricks/labs/ucx/framework/owners.py
index 15cb8f2bcc..c599c1ff0d 100644
--- a/src/databricks/labs/ucx/framework/owners.py
+++ b/src/databricks/labs/ucx/framework/owners.py
@@ -224,6 +224,8 @@ def _maybe_type_and_id(path: WorkspacePath) -> tuple[str, str] | None:
         except InvalidParameterValue:
             logger.warning(f"Cannot retrieve status for: {path}")
             return None
+        if not (object_info.object_id and object_info.object_type):
+            return None
         object_id = str(object_info.object_id)
         match object_info.object_type:
             case ObjectType.NOTEBOOK:
@@ -232,6 +234,8 @@ def _maybe_type_and_id(path: WorkspacePath) -> tuple[str, str] | None:
                 return 'files', object_id
             case ObjectType.DIRECTORY:
                 return 'directories', object_id
+            case _:
+                logger.warning(f"Unsupported object type: {object_info.object_type.value}")
         return None

     @staticmethod

From bf7a9fb6372464a2a642414ce3ab3c2ce3ac5b89 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Tue, 17 Dec 2024 11:19:43 +0100
Subject: [PATCH 038/129] Test Lakeview dashboard ownership is me

---
 tests/integration/assessment/test_dashboards.py | 11 +++++++++
 1 file changed, 11 insertions(+)

diff --git a/tests/integration/assessment/test_dashboards.py b/tests/integration/assessment/test_dashboards.py
index cc711abe23..86e6d41fef 100644
--- a/tests/integration/assessment/test_dashboards.py
+++ b/tests/integration/assessment/test_dashboards.py
@@ -76,3 +76,14 @@ def test_redash_dashboard_ownership_is_me(runtime_ctx) -> None:

     me = runtime_ctx.workspace_client.current_user.me()
     assert owner == me.display_name
+
+
+def test_lakeview_dashboard_ownership_is_me(runtime_ctx, make_lakeview_dashboard) -> None:
+    """Lakeview dashboards do not have a `creator` field, thus we fall back on the parent workspace path owner"""
+    sdk_lakeview_dashboard = make_lakeview_dashboard()
+    dashboard = Dashboard.from_sdk_lakeview_dashboard(sdk_lakeview_dashboard)
+
+    owner = runtime_ctx.dashboard_ownership.owner_of(dashboard)
+
+    me =
runtime_ctx.workspace_client.current_user.me()
+    assert owner == me.user_name

From 95e2229caf8481d3af9cba341b48a9d6b8c326f2 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Tue, 17 Dec 2024 11:20:01 +0100
Subject: [PATCH 039/129] Test getting user name

---
 src/databricks/labs/ucx/assessment/dashboards.py | 2 +-
 tests/integration/assessment/test_dashboards.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/databricks/labs/ucx/assessment/dashboards.py b/src/databricks/labs/ucx/assessment/dashboards.py
index b429fd2961..9ceb0c5bd1 100644
--- a/src/databricks/labs/ucx/assessment/dashboards.py
+++ b/src/databricks/labs/ucx/assessment/dashboards.py
@@ -417,7 +417,7 @@ def _maybe_direct_owner(self, record: Dashboard) -> str | None:
     def _get_user_name(self, user_id: str) -> str | None:
         try:
             user = self._ws.users.get(user_id)
-            return user.display_name or user.user_name
+            return user.user_name
         except DatabricksError as e:
             logger.warning(f"Could not retrieve user: {user_id}", exc_info=e)
             return None
diff --git a/tests/integration/assessment/test_dashboards.py b/tests/integration/assessment/test_dashboards.py
index 86e6d41fef..76ee3fbe96 100644
--- a/tests/integration/assessment/test_dashboards.py
+++ b/tests/integration/assessment/test_dashboards.py
@@ -75,7 +75,7 @@ def test_redash_dashboard_ownership_is_me(runtime_ctx) -> None:
     owner = runtime_ctx.dashboard_ownership.owner_of(dashboard)

     me = runtime_ctx.workspace_client.current_user.me()
-    assert owner == me.display_name
+    assert owner == me.user_name

From 2659aecaca0c228d0ee812d701e0dda567560316 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Tue, 17 Dec 2024 11:23:07 +0100
Subject: [PATCH 040/129] Handle resource does not exist

---
 src/databricks/labs/ucx/framework/owners.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/databricks/labs/ucx/framework/owners.py b/src/databricks/labs/ucx/framework/owners.py
index c599c1ff0d..202e2d9d67 100644
--- a/src/databricks/labs/ucx/framework/owners.py
+++ b/src/databricks/labs/ucx/framework/owners.py
@@ -7,7 +7,7 @@
 from databricks.labs.blueprint.paths import WorkspacePath
 from databricks.sdk import WorkspaceClient
-from databricks.sdk.errors import InternalError, InvalidParameterValue, NotFound
+from databricks.sdk.errors import InternalError, InvalidParameterValue, NotFound, ResourceDoesNotExist
 from databricks.sdk.retries import retried
 from databricks.sdk.service.iam import User, ObjectPermissions, PermissionLevel
 from databricks.sdk.service.workspace import ObjectType
@@ -221,7 +221,7 @@ def _maybe_type_and_id(path: WorkspacePath) -> tuple[str, str] | None:
         try:
             object_info = path._object_info  # pylint: disable=protected-access
-        except InvalidParameterValue:
+        except (InvalidParameterValue, ResourceDoesNotExist):
             logger.warning(f"Cannot retrieve status for: {path}")
             return None
         if not (object_info.object_id and object_info.object_type):

From d0a59159f9dc636880258d2c6f789f8e1ebf053b Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Tue, 17 Dec 2024 11:25:49 +0100
Subject: [PATCH 041/129] Handle None types

---
 src/databricks/labs/ucx/framework/owners.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/databricks/labs/ucx/framework/owners.py b/src/databricks/labs/ucx/framework/owners.py
index 202e2d9d67..e874c58da1 100644
---
a/src/databricks/labs/ucx/framework/owners.py +++ b/src/databricks/labs/ucx/framework/owners.py @@ -239,8 +239,12 @@ def _maybe_type_and_id(path: WorkspacePath) -> tuple[str, str] | None: return None @staticmethod - def _infer_from_first_can_manage(object_permissions: ObjectPermissions) -> str: + def _infer_from_first_can_manage(object_permissions: ObjectPermissions) -> str | None: + if object_permissions.access_control_list is None: + return None for acl in object_permissions.access_control_list: + if acl.all_permissions is None: + return None for permission in acl.all_permissions: if permission.permission_level != PermissionLevel.CAN_MANAGE: continue From 3ab57ade08d875a633924d7e98b3472f7a70e70b Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 17 Dec 2024 11:27:11 +0100 Subject: [PATCH 042/129] Remove legacy test --- tests/unit/assessment/test_dashboards.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/tests/unit/assessment/test_dashboards.py b/tests/unit/assessment/test_dashboards.py index b9ffa948d6..6926875073 100644 --- a/tests/unit/assessment/test_dashboards.py +++ b/tests/unit/assessment/test_dashboards.py @@ -617,21 +617,6 @@ def test_lakeview_dashboard_crawler_list_queries_handles_not_found(caplog, mock_ ws.lakeview.get.assert_called_once_with("did") -def test_dashboard_ownership_owner_of_from_user_display_name() -> None: - administrator_locator = create_autospec(AdministratorLocator) - ws = create_autospec(WorkspaceClient) - ws.users.get.return_value = User(display_name="Cor") - workspace_path_ownership = create_autospec(WorkspacePathOwnership) - ownership = DashboardOwnership(administrator_locator, ws, workspace_path_ownership) - - owner = ownership.owner_of(Dashboard("id", creator_id="123456789")) - - assert owner == "Cor" - administrator_locator.get_workspace_administrator.assert_not_called() - ws.users.get.assert_called_with("123456789") - workspace_path_ownership.owner_of_path.assert_not_called() - - def test_dashboard_ownership_owner_of_from_user_email() -> None: administrator_locator = create_autospec(AdministratorLocator) ws = create_autospec(WorkspaceClient) From 3a1502461648010b1cbade1ac1d373d13b6d84d2 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 17 Dec 2024 11:29:58 +0100 Subject: [PATCH 043/129] Update unit test --- tests/unit/progress/test_dashboards.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/progress/test_dashboards.py b/tests/unit/progress/test_dashboards.py index 5c17001afc..55da64faba 100644 --- a/tests/unit/progress/test_dashboards.py +++ b/tests/unit/progress/test_dashboards.py @@ -54,7 +54,7 @@ object_type="Dashboard", object_id=["did4"], data={"id": "did4", "query_ids": "[]", "tags": "[]"}, - failures=["Used by TABLE: hive_metastore.schema.table"], + failures=["Pending migration: hive_metastore.schema.table"], owner="cor", ucx_version=ucx_version, ), @@ -101,7 +101,7 @@ def test_dashboard_progress_encoder(expected: Row) -> None: "object_type": "TABLE", "table_format": "DELTA", }, - failures=["Used by TABLE: hive_metastore.schema.table"], + failures=["Pending migration"], owner="cor", ucx_version=ucx_version, ) From 9d85f2fbcd3e8fc2404dafe17559cddbd10110c6 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 17 Dec 2024 11:31:20 +0100 Subject: [PATCH 044/129] Rename me to current_user --- tests/integration/assessment/test_dashboards.py | 8 ++++---- tests/integration/framework/test_owners.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git 
a/tests/integration/assessment/test_dashboards.py b/tests/integration/assessment/test_dashboards.py index 76ee3fbe96..83ed85714e 100644 --- a/tests/integration/assessment/test_dashboards.py +++ b/tests/integration/assessment/test_dashboards.py @@ -74,8 +74,8 @@ def test_redash_dashboard_ownership_is_me(runtime_ctx) -> None: owner = runtime_ctx.dashboard_ownership.owner_of(dashboard) - me = runtime_ctx.workspace_client.current_user.me() - assert owner == me.user_name + current_user = runtime_ctx.workspace_client.current_user.me() + assert owner == current_user.user_name def test_lakeview_dashboard_ownership_is_me(runtime_ctx, make_lakeview_dashboard) -> None: @@ -85,5 +85,5 @@ def test_lakeview_dashboard_ownership_is_me(runtime_ctx, make_lakeview_dashboard owner = runtime_ctx.dashboard_ownership.owner_of(dashboard) - me = runtime_ctx.workspace_client.current_user.me() - assert owner == me.user_name + current_user = runtime_ctx.workspace_client.current_user.me() + assert owner == current_user.user_name diff --git a/tests/integration/framework/test_owners.py b/tests/integration/framework/test_owners.py index f86d34238e..9367861bc8 100644 --- a/tests/integration/framework/test_owners.py +++ b/tests/integration/framework/test_owners.py @@ -86,12 +86,12 @@ def test_file_owner(make_workspace_file, ws): def test_home_directory_owner_is_me(runtime_ctx) -> None: - me = runtime_ctx.workspace_client.current_user.me() - home = f"/Users/{me.user_name}" + current_user = runtime_ctx.workspace_client.current_user.me() + home = f"/Users/{current_user.user_name}" name = runtime_ctx.workspace_path_ownership.owner_of_path(home) - assert name == me.user_name + assert name == current_user.user_name def test_workspace_path_owner_of_invalid_path(runtime_ctx) -> None: From b6a6a51c21ddf7052cea2e11e9831ae2e75f22e0 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 20 Dec 2024 11:13:41 +0100 Subject: [PATCH 045/129] The owner of an invalid path should fallback on the workspace admin --- tests/integration/framework/test_owners.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/framework/test_owners.py b/tests/integration/framework/test_owners.py index 9367861bc8..c8fa0b6fdf 100644 --- a/tests/integration/framework/test_owners.py +++ b/tests/integration/framework/test_owners.py @@ -96,4 +96,4 @@ def test_home_directory_owner_is_me(runtime_ctx) -> None: def test_workspace_path_owner_of_invalid_path(runtime_ctx) -> None: owner = runtime_ctx.workspace_path_ownership.owner_of_path("invalid/path") - assert owner is None + assert owner == runtime_ctx.administrator_locator.get_workspace_administrator() From 6b4829f39bd01f9c0bb49b8d55d494a49a9051da Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 20 Dec 2024 11:17:40 +0100 Subject: [PATCH 046/129] Improve assert message --- tests/integration/assessment/test_dashboards.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/assessment/test_dashboards.py b/tests/integration/assessment/test_dashboards.py index 83ed85714e..ce194c4ea5 100644 --- a/tests/integration/assessment/test_dashboards.py +++ b/tests/integration/assessment/test_dashboards.py @@ -75,7 +75,7 @@ def test_redash_dashboard_ownership_is_me(runtime_ctx) -> None: owner = runtime_ctx.dashboard_ownership.owner_of(dashboard) current_user = runtime_ctx.workspace_client.current_user.me() - assert owner == current_user.user_name + assert owner == current_user.user_name, f"Invalid owner for dashboard: {dashboard}" def 
test_lakeview_dashboard_ownership_is_me(runtime_ctx, make_lakeview_dashboard) -> None: From d82de722a0d63e7c4938ed40fef78cf8c3721f46 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 20 Dec 2024 13:35:16 +0100 Subject: [PATCH 047/129] Skip test when running in debug --- tests/integration/assessment/test_dashboards.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/integration/assessment/test_dashboards.py b/tests/integration/assessment/test_dashboards.py index ce194c4ea5..166391fcc1 100644 --- a/tests/integration/assessment/test_dashboards.py +++ b/tests/integration/assessment/test_dashboards.py @@ -68,7 +68,9 @@ def test_lakeview_dashboard_crawler_crawls_dashboard( assert dashboards == [Dashboard.from_sdk_lakeview_dashboard(dashboard)] -def test_redash_dashboard_ownership_is_me(runtime_ctx) -> None: +def test_redash_dashboard_ownership_is_me(runtime_ctx, is_in_debug) -> None: + """The Redash owner should be the user that creates the dashboard, i.e. who runs this integration test.""" + _ = is_in_debug # The user cannot be found using the Dashboard creator user ID when running this test from the CI sdk_redash_dashboard = runtime_ctx.make_dashboard() dashboard = Dashboard.from_sdk_redash_dashboard(sdk_redash_dashboard) From 093845fa3931418d53a2ed1c71d9c75db8d55581 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 15 Nov 2024 13:18:44 +0100 Subject: [PATCH 048/129] Copy changes from #3112 https://github.com/databrickslabs/ucx/pull/3112 --- ...> 01_00_percentage_migration_progress.sql} | 2 +- ..._01_percentage_udf_migration_progress.sql} | 0 ...2_percentage_grant_migration_progress.sql} | 0 ..._03_percentage_job_migration_progress.sql} | 0 ...percentage_cluster_migration_progress.sql} | 0 ...5_percentage_table_migration_progress.sql} | 2 +- .../01_06_percentage_used_table_progress.sql | 5 +++ .../01_07_count_direct_filesystem_access.sql | 7 +++ .../main/01_08_count_query_problem.sql | 6 +++ ...ercentage_pipeline_migration_progress.sql} | 0 ..._percentage_policy_migration_progress.sql} | 0 ..._11_distinct_failures_per_object_type.sql} | 2 +- ...2_4_migration_status_by_owner_overview.sql | 2 +- .../ucx/queries/progress/main/03_00_code.md | 8 ++++ ...ending_migration_data_asset_references.sql | 4 ++ ...ta_asset_references_by_owner_bar_graph.sql | 24 +++++++++++ .../03_03_migrated_data_asset_references.sql | 4 ++ ..._references_pending_migration_overview.sql | 20 +++++++++ ...ata_asset_references_pending_migration.sql | 43 +++++++++++++++++++ .../main/03_06_code_compatibility_issues.sql | 29 +++++++++++++ 20 files changed, 154 insertions(+), 4 deletions(-) rename src/databricks/labs/ucx/queries/progress/main/{01_0_percentage_migration_progress.sql => 01_00_percentage_migration_progress.sql} (55%) rename src/databricks/labs/ucx/queries/progress/main/{01_1_percentage_udf_migration_progress.sql => 01_01_percentage_udf_migration_progress.sql} (100%) rename src/databricks/labs/ucx/queries/progress/main/{01_2_percentage_grant_migration_progress.sql => 01_02_percentage_grant_migration_progress.sql} (100%) rename src/databricks/labs/ucx/queries/progress/main/{01_3_percentage_job_migration_progress.sql => 01_03_percentage_job_migration_progress.sql} (100%) rename src/databricks/labs/ucx/queries/progress/main/{01_4_percentage_cluster_migration_progress.sql => 01_04_percentage_cluster_migration_progress.sql} (100%) rename src/databricks/labs/ucx/queries/progress/main/{01_5_percentage_table_migration_progress.sql => 01_05_percentage_table_migration_progress.sql} 
(75%) create mode 100644 src/databricks/labs/ucx/queries/progress/main/01_06_percentage_used_table_progress.sql create mode 100644 src/databricks/labs/ucx/queries/progress/main/01_07_count_direct_filesystem_access.sql create mode 100644 src/databricks/labs/ucx/queries/progress/main/01_08_count_query_problem.sql rename src/databricks/labs/ucx/queries/progress/main/{01_6_percentage_pipeline_migration_progress.sql => 01_09_percentage_pipeline_migration_progress.sql} (100%) rename src/databricks/labs/ucx/queries/progress/main/{01_7_percentage_policy_migration_progress.sql => 01_10_percentage_policy_migration_progress.sql} (100%) rename src/databricks/labs/ucx/queries/progress/main/{01_8_distinct_failures_per_object_type.sql => 01_11_distinct_failures_per_object_type.sql} (67%) create mode 100644 src/databricks/labs/ucx/queries/progress/main/03_00_code.md create mode 100644 src/databricks/labs/ucx/queries/progress/main/03_01_pending_migration_data_asset_references.sql create mode 100644 src/databricks/labs/ucx/queries/progress/main/03_02_data_asset_references_by_owner_bar_graph.sql create mode 100644 src/databricks/labs/ucx/queries/progress/main/03_03_migrated_data_asset_references.sql create mode 100644 src/databricks/labs/ucx/queries/progress/main/03_04_data_asset_references_pending_migration_overview.sql create mode 100644 src/databricks/labs/ucx/queries/progress/main/03_05_data_asset_references_pending_migration.sql create mode 100644 src/databricks/labs/ucx/queries/progress/main/03_06_code_compatibility_issues.sql diff --git a/src/databricks/labs/ucx/queries/progress/main/01_0_percentage_migration_progress.sql b/src/databricks/labs/ucx/queries/progress/main/01_00_percentage_migration_progress.sql similarity index 55% rename from src/databricks/labs/ucx/queries/progress/main/01_0_percentage_migration_progress.sql rename to src/databricks/labs/ucx/queries/progress/main/01_00_percentage_migration_progress.sql index d429af42cc..d5ca534978 100644 --- a/src/databricks/labs/ucx/queries/progress/main/01_0_percentage_migration_progress.sql +++ b/src/databricks/labs/ucx/queries/progress/main/01_00_percentage_migration_progress.sql @@ -2,4 +2,4 @@ SELECT ROUND(100 * try_divide(COUNT_IF(SIZE(failures) = 0), COUNT(*)), 2) AS percentage FROM ucx_catalog.multiworkspace.objects_snapshot -WHERE object_type IN ('ClusterInfo', 'Grant', 'JobInfo', 'PipelineInfo', 'PolicyInfo', 'Table', 'Udf') +WHERE object_type IN ('ClusterInfo', 'DirectFsAccess', 'Grant', 'JobInfo', 'PipelineInfo', 'PolicyInfo', 'QueryProblem', 'Table', 'Udf', 'UsedTable') diff --git a/src/databricks/labs/ucx/queries/progress/main/01_1_percentage_udf_migration_progress.sql b/src/databricks/labs/ucx/queries/progress/main/01_01_percentage_udf_migration_progress.sql similarity index 100% rename from src/databricks/labs/ucx/queries/progress/main/01_1_percentage_udf_migration_progress.sql rename to src/databricks/labs/ucx/queries/progress/main/01_01_percentage_udf_migration_progress.sql diff --git a/src/databricks/labs/ucx/queries/progress/main/01_2_percentage_grant_migration_progress.sql b/src/databricks/labs/ucx/queries/progress/main/01_02_percentage_grant_migration_progress.sql similarity index 100% rename from src/databricks/labs/ucx/queries/progress/main/01_2_percentage_grant_migration_progress.sql rename to src/databricks/labs/ucx/queries/progress/main/01_02_percentage_grant_migration_progress.sql diff --git a/src/databricks/labs/ucx/queries/progress/main/01_3_percentage_job_migration_progress.sql 
b/src/databricks/labs/ucx/queries/progress/main/01_03_percentage_job_migration_progress.sql similarity index 100% rename from src/databricks/labs/ucx/queries/progress/main/01_3_percentage_job_migration_progress.sql rename to src/databricks/labs/ucx/queries/progress/main/01_03_percentage_job_migration_progress.sql diff --git a/src/databricks/labs/ucx/queries/progress/main/01_4_percentage_cluster_migration_progress.sql b/src/databricks/labs/ucx/queries/progress/main/01_04_percentage_cluster_migration_progress.sql similarity index 100% rename from src/databricks/labs/ucx/queries/progress/main/01_4_percentage_cluster_migration_progress.sql rename to src/databricks/labs/ucx/queries/progress/main/01_04_percentage_cluster_migration_progress.sql diff --git a/src/databricks/labs/ucx/queries/progress/main/01_5_percentage_table_migration_progress.sql b/src/databricks/labs/ucx/queries/progress/main/01_05_percentage_table_migration_progress.sql similarity index 75% rename from src/databricks/labs/ucx/queries/progress/main/01_5_percentage_table_migration_progress.sql rename to src/databricks/labs/ucx/queries/progress/main/01_05_percentage_table_migration_progress.sql index b3a2e4554e..120dbab112 100644 --- a/src/databricks/labs/ucx/queries/progress/main/01_5_percentage_table_migration_progress.sql +++ b/src/databricks/labs/ucx/queries/progress/main/01_05_percentage_table_migration_progress.sql @@ -1,4 +1,4 @@ -/* --title 'Table migration progress (%)' --width 2 */ +/* --title 'Table migration progress (%)' */ SELECT ROUND(100 * TRY_DIVIDE(COUNT_IF(SIZE(failures) = 0), COUNT(*)), 2) AS percentage FROM ucx_catalog.multiworkspace.objects_snapshot diff --git a/src/databricks/labs/ucx/queries/progress/main/01_06_percentage_used_table_progress.sql b/src/databricks/labs/ucx/queries/progress/main/01_06_percentage_used_table_progress.sql new file mode 100644 index 0000000000..544062edc2 --- /dev/null +++ b/src/databricks/labs/ucx/queries/progress/main/01_06_percentage_used_table_progress.sql @@ -0,0 +1,5 @@ +/* --title '"Table references in code" progress (%)' --description 'Tables referring UC over Hive metastore' */ +SELECT + ROUND(100 * TRY_DIVIDE(COUNT_IF(SIZE(failures) = 0), COUNT(*)), 2) AS percentage +FROM ucx_catalog.multiworkspace.objects_snapshot +WHERE object_type = "UsedTable" diff --git a/src/databricks/labs/ucx/queries/progress/main/01_07_count_direct_filesystem_access.sql b/src/databricks/labs/ucx/queries/progress/main/01_07_count_direct_filesystem_access.sql new file mode 100644 index 0000000000..2a79b7d902 --- /dev/null +++ b/src/databricks/labs/ucx/queries/progress/main/01_07_count_direct_filesystem_access.sql @@ -0,0 +1,7 @@ +/* --title 'Direct filesystem access progress (#)' --description 'Unsupported in Unity Catalog' */ +SELECT COUNT(*) AS counter +FROM ucx_catalog.multiworkspace.objects_snapshot +WHERE object_type = "DirectFsAccess" + -- Redundant filter as a direct filesystem access is a failure by definition (see description above), + -- however, filter is defined for explicitness and as this knowledge is not "known" to this query. 
+ AND SIZE(failures) > 0 diff --git a/src/databricks/labs/ucx/queries/progress/main/01_08_count_query_problem.sql b/src/databricks/labs/ucx/queries/progress/main/01_08_count_query_problem.sql new file mode 100644 index 0000000000..a70028dc6b --- /dev/null +++ b/src/databricks/labs/ucx/queries/progress/main/01_08_count_query_problem.sql @@ -0,0 +1,6 @@ +/* --title 'Query problem progress (#)' */ +SELECT COUNT(*) AS counter +FROM ucx_catalog.multiworkspace.objects_snapshot +WHERE object_type = "QueryProblem" + -- Redundant filter as a query problem is a failure by definition, however, filter is defined for explicitness + AND SIZE(failures) > 0 diff --git a/src/databricks/labs/ucx/queries/progress/main/01_6_percentage_pipeline_migration_progress.sql b/src/databricks/labs/ucx/queries/progress/main/01_09_percentage_pipeline_migration_progress.sql similarity index 100% rename from src/databricks/labs/ucx/queries/progress/main/01_6_percentage_pipeline_migration_progress.sql rename to src/databricks/labs/ucx/queries/progress/main/01_09_percentage_pipeline_migration_progress.sql diff --git a/src/databricks/labs/ucx/queries/progress/main/01_7_percentage_policy_migration_progress.sql b/src/databricks/labs/ucx/queries/progress/main/01_10_percentage_policy_migration_progress.sql similarity index 100% rename from src/databricks/labs/ucx/queries/progress/main/01_7_percentage_policy_migration_progress.sql rename to src/databricks/labs/ucx/queries/progress/main/01_10_percentage_policy_migration_progress.sql diff --git a/src/databricks/labs/ucx/queries/progress/main/01_8_distinct_failures_per_object_type.sql b/src/databricks/labs/ucx/queries/progress/main/01_11_distinct_failures_per_object_type.sql similarity index 67% rename from src/databricks/labs/ucx/queries/progress/main/01_8_distinct_failures_per_object_type.sql rename to src/databricks/labs/ucx/queries/progress/main/01_11_distinct_failures_per_object_type.sql index 00a229d02f..75cb3bcaf6 100644 --- a/src/databricks/labs/ucx/queries/progress/main/01_8_distinct_failures_per_object_type.sql +++ b/src/databricks/labs/ucx/queries/progress/main/01_11_distinct_failures_per_object_type.sql @@ -2,7 +2,7 @@ with failures AS ( SELECT object_type, explode(failures) AS failure FROM ucx_catalog.multiworkspace.objects_snapshot - WHERE object_type IN ('ClusterInfo', 'Grant', 'JobInfo', 'PipelineInfo', 'PolicyInfo', 'Table', 'Udf') + WHERE object_type IN ('ClusterInfo', 'DirectFsAccess', 'Grant', 'JobInfo', 'PipelineInfo', 'PolicyInfo', 'QueryProblem', 'Table', 'Udf', 'UsedTable') ) SELECT diff --git a/src/databricks/labs/ucx/queries/progress/main/02_4_migration_status_by_owner_overview.sql b/src/databricks/labs/ucx/queries/progress/main/02_4_migration_status_by_owner_overview.sql index a9d7a7591f..c4ff69b267 100644 --- a/src/databricks/labs/ucx/queries/progress/main/02_4_migration_status_by_owner_overview.sql +++ b/src/databricks/labs/ucx/queries/progress/main/02_4_migration_status_by_owner_overview.sql @@ -1,6 +1,6 @@ /* --title 'Overview' --description 'Tables and views migration' --width 5 */ WITH migration_statuses AS ( - SELECT * + SELECT owner, failures FROM ucx_catalog.multiworkspace.objects_snapshot WHERE object_type = 'Table' ) diff --git a/src/databricks/labs/ucx/queries/progress/main/03_00_code.md b/src/databricks/labs/ucx/queries/progress/main/03_00_code.md new file mode 100644 index 0000000000..ca3fd81e2a --- /dev/null +++ b/src/databricks/labs/ucx/queries/progress/main/03_00_code.md @@ -0,0 +1,8 @@ +# Code + +This section shows Unity Catalog 
compatibility issues found while linting code. There are two kinds of code changes to +perform: +- Data asset reference, like references to Hive metastore tables and views or direct filesystem access (dfsa). These + references should be updated to refer to their Unity Catalog counterparts. +- Linting compatibility issues, like using RDDs or directly accessing the Spark context. These issues should be resolved + by following the instructions stated with the issue. diff --git a/src/databricks/labs/ucx/queries/progress/main/03_01_pending_migration_data_asset_references.sql b/src/databricks/labs/ucx/queries/progress/main/03_01_pending_migration_data_asset_references.sql new file mode 100644 index 0000000000..d6388b41b3 --- /dev/null +++ b/src/databricks/labs/ucx/queries/progress/main/03_01_pending_migration_data_asset_references.sql @@ -0,0 +1,4 @@ +/* --title 'Pending migration' --description 'Total number of table, view and dfsa references' --height 6 */ +SELECT COUNT(*) AS count +FROM ucx_catalog.multiworkspace.objects_snapshot +WHERE object_type IN ('DirectFsAccess', 'UsedTable') AND SIZE(failures) > 0 diff --git a/src/databricks/labs/ucx/queries/progress/main/03_02_data_asset_references_by_owner_bar_graph.sql b/src/databricks/labs/ucx/queries/progress/main/03_02_data_asset_references_by_owner_bar_graph.sql new file mode 100644 index 0000000000..3910fc0b06 --- /dev/null +++ b/src/databricks/labs/ucx/queries/progress/main/03_02_data_asset_references_by_owner_bar_graph.sql @@ -0,0 +1,24 @@ +/* +--title 'Pending migration' +--description 'Tables, views and dfsa per owner' +--width 5 +--overrides '{"spec": { + "version": 3, + "widgetType": "bar", + "encodings": { + "x": {"fieldName": "owner", "scale": {"type": "categorical"}, "displayName": "owner"}, + "y": {"fieldName": "count", "scale": {"type": "quantitative"}, "displayName": "count"} + } +}}' +*/ +WITH owners_with_failures AS ( + SELECT owner + FROM ucx_catalog.multiworkspace.objects_snapshot + WHERE object_type IN ('DirectFsAccess', 'UsedTable') AND SIZE(failures) > 0 +) + +SELECT + owner, + COUNT(1) AS count +FROM owners_with_failures +GROUP BY owner diff --git a/src/databricks/labs/ucx/queries/progress/main/03_03_migrated_data_asset_references.sql b/src/databricks/labs/ucx/queries/progress/main/03_03_migrated_data_asset_references.sql new file mode 100644 index 0000000000..689e2bfaf0 --- /dev/null +++ b/src/databricks/labs/ucx/queries/progress/main/03_03_migrated_data_asset_references.sql @@ -0,0 +1,4 @@ +/* --title 'Migrated' --description 'Total number of table, view and dfsa references' --height 6 */ +SELECT COUNT(*) AS count +FROM ucx_catalog.multiworkspace.objects_snapshot +WHERE object_type IN ('DirectFsAccess', 'UsedTable') AND SIZE(failures) == 0 diff --git a/src/databricks/labs/ucx/queries/progress/main/03_04_data_asset_references_pending_migration_overview.sql b/src/databricks/labs/ucx/queries/progress/main/03_04_data_asset_references_pending_migration_overview.sql new file mode 100644 index 0000000000..1b14d7185b --- /dev/null +++ b/src/databricks/labs/ucx/queries/progress/main/03_04_data_asset_references_pending_migration_overview.sql @@ -0,0 +1,20 @@ +/* --title 'Overview' --description 'Table, view and dfsa migration' --width 5 */ +WITH migration_statuses AS ( + SELECT owner, object_type, failures + FROM ucx_catalog.multiworkspace.objects_snapshot + WHERE object_type IN ('DirectFsAccess', 'UsedTable') +) + +SELECT + owner, + CASE + WHEN object_type = 'DirectFsAccess' THEN 'Direct filesystem access' + WHEN object_type =
'UsedTable' THEN 'Table or view reference' + ELSE object_type + END AS object_type, + DOUBLE(CEIL(100 * COUNT_IF(SIZE(failures) = 0) / SUM(COUNT(*)) OVER (PARTITION BY owner, object_type), 2)) AS percentage, + COUNT(*) AS total, + COUNT_IF(SIZE(failures) = 0) AS total_migrated, + COUNT_IF(SIZE(failures) > 0) AS total_not_migrated +FROM migration_statuses +GROUP BY owner, object_type diff --git a/src/databricks/labs/ucx/queries/progress/main/03_05_data_asset_references_pending_migration.sql b/src/databricks/labs/ucx/queries/progress/main/03_05_data_asset_references_pending_migration.sql new file mode 100644 index 0000000000..0db6a1bc8c --- /dev/null +++ b/src/databricks/labs/ucx/queries/progress/main/03_05_data_asset_references_pending_migration.sql @@ -0,0 +1,43 @@ +/* +--title 'Data asset references' +--width 6 +--overrides '{"spec":{ + "encodings":{ + "columns": [ + {"fieldName": "workspace_id", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "workspace_id"}, + {"fieldName": "object_type", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "object_type"}, + {"fieldName": "object_id", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ link }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "object_id"}, + {"fieldName": "failure", "booleanValues": ["false", "true"], "type": "integer", "displayAs": "number", "title": "failure"}, + {"fieldName": "is_read", "booleanValues": ["false", "true"], "type": "integer", "displayAs": "number", "title": "is_read"}, + {"fieldName": "is_write", "booleanValues": ["false", "true"], "type": "integer", "displayAs": "number", "title": "is_write"} + ]}, + "invisibleColumns": [ + {"name": "link", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "link"} + ] + }}' +*/ +SELECT + workspace_id, + owner, + CASE + WHEN object_type = 'DirectFsAccess' THEN 'Direct filesystem access' + WHEN object_type = 'UsedTable' THEN 'Table or view reference' + ELSE object_type + END AS object_type, + CASE + WHEN object_type = 'DirectFsAccess' THEN data.path + WHEN object_type = 'UsedTable' THEN CONCAT_WS('.', object_id) + ELSE CONCAT_WS('.', object_id) + END AS object_id, + EXPLODE(failures) AS failure, + CAST(data.is_read AS BOOLEAN) AS is_read, + CAST(data.is_write AS BOOLEAN) AS is_write, + -- Below are invisible column(s) used in links url templates + CASE + -- SQL queries do NOT point to the workspace, i.e. 
their source_id does not start with '/' + WHEN object_type = 'DirectFsAccess' AND SUBSTRING(data.source_id, 0, 1) != '/' THEN CONCAT('/sql/editor/', data.source_id) + ELSE CONCAT('/#workspace', data.source_id) + END AS link +FROM ucx_catalog.multiworkspace.objects_snapshot +WHERE object_type IN ('DirectFsAccess', 'UsedTable') +ORDER BY workspace_id, owner, object_type, object_id diff --git a/src/databricks/labs/ucx/queries/progress/main/03_06_code_compatibility_issues.sql b/src/databricks/labs/ucx/queries/progress/main/03_06_code_compatibility_issues.sql new file mode 100644 index 0000000000..1c623da8fd --- /dev/null +++ b/src/databricks/labs/ucx/queries/progress/main/03_06_code_compatibility_issues.sql @@ -0,0 +1,29 @@ +/* +--title 'Code compatibility issues' +--width 6 +--overrides '{"spec":{ + "encodings":{ + "columns": [ + {"fieldName": "workspace_id", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "workspace_id"}, + {"fieldName": "code", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "code"}, + {"fieldName": "message", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "message"}, + {"fieldName": "dashboard_name", "booleanValues": ["false", "true"], "linkUrlTemplate": "/sql/dashboards/{{ dashboard_id }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "dashboard"}, + {"fieldName": "query_name", "booleanValues": ["false", "true"], "linkUrlTemplate": "/sql/editor/{{ query_id }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "query"} + ]}, + "invisibleColumns": [ + {"name": "dashboard_id", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "dashboard_id"}, + {"name": "query_id", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "query_id"} + ] + }}' +*/ +SELECT + workspace_id, + data.code, + data.message, + data.dashboard_name, + data.query_name, + -- Below are invisible columns used in links url templates + data.dashboard_id, + data.query_id +FROM ucx_catalog.multiworkspace.objects_snapshot +WHERE object_type = 'QueryProblem' From e45b1b63c065bffd4600bb61d691a85f7449fa22 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 15 Nov 2024 13:24:28 +0100 Subject: [PATCH 049/129] Add integration test --- .../queries/test_migration_progress.py | 283 +++++++++++++++++- 1 file changed, 273 insertions(+), 10 deletions(-) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index f3596a6777..10391c7936 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -16,7 +16,10 @@ from databricks.labs.ucx.hive_metastore.udfs import Udf from databricks.labs.ucx.progress.install import ProgressTrackingInstallation from databricks.labs.ucx.progress.workflow_runs import WorkflowRun +from databricks.labs.ucx.source_code.base import DirectFsAccess, LineageAtom from databricks.labs.ucx.source_code.jobs import JobProblem +from databricks.labs.ucx.source_code.queries import QueryProblem +from
databricks.labs.ucx.source_code.used_table import UsedTable from ..conftest import MockRuntimeContext @@ -188,6 +191,110 @@ def policies() -> list[PolicyInfo]: return records +@pytest.fixture +def query_problems(make_dashboard, make_query) -> list[QueryProblem]: + dashboard, query = make_dashboard(), make_query() + records = [ + QueryProblem( + dashboard.id, + dashboard.parent, + dashboard.name, + query.id, + query.parent, + query.name, + "sql-parse-error", + "Could not parse SQL", + ) + ] + return records + + +@pytest.fixture +def dfsas(make_workspace_file, make_query) -> list[DirectFsAccess]: + workspace_file = make_workspace_file(content='df = spark.read.csv("dbfs://folder/file.csv")') + query = make_query(sql_query="SELECT * FROM csv.`dbfs://folder/file.csv`") + records = [ + DirectFsAccess( + path="dbfs://folder/file.csv", + is_read=False, + # Technically, the mocked code is reading the path, but marking it as write allows us to set the owner to + # the current user, which we can test below. + is_write=True, + source_id=str(workspace_file), + source_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(hours=2.0), + source_lineage=[ + LineageAtom(object_type="WORKFLOW", object_id="my_workflow_id", other={"name": "my_workflow"}), + LineageAtom(object_type="TASK", object_id="my_workflow_id/my_task_id"), + LineageAtom(object_type="NOTEBOOK", object_id="my_notebook_path"), + LineageAtom(object_type="FILE", object_id=str(workspace_file)), + ], + assessment_start_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=5.0), + assessment_end_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=2.0), + ), + DirectFsAccess( + path="dbfs://folder/file.csv", + is_read=False, + # Technically, the mocked code is reading the path, but marking it as write allows us to set the owner to + # the current user, which we can test below. + is_write=True, + source_id=query.id, + source_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(hours=2.0), + source_lineage=[ + LineageAtom(object_type="DASHBOARD", object_id="my_dashboard_id", other={"name": "my_dashboard"}), + LineageAtom(object_type="QUERY", object_id=f"my_dashboard_id/{query.id}", other={"name": "my_query"}), + ], + assessment_start_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=5.0), + assessment_end_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=2.0), + ), + ] + return records + + +@pytest.fixture +def used_tables(make_workspace_file, make_table) -> list[UsedTable]: + table = make_table(catalog_name="hive_metastore") + workspace_file = make_workspace_file(content=f'df = spark.read.table("{table.full_name}")\ndisplay(df)') + records = [ + UsedTable( + catalog_name=table.catalog_name, # This table is pending migration + schema_name=table.schema_name, + table_name=table.name, + is_read=False, + # Technically, the mocked code is reading the table, but marking it as write allows us to set the owner to + # the current user, which we can test below. 
+ is_write=True, + source_id=str(workspace_file), + source_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(hours=2.0), + source_lineage=[ + LineageAtom(object_type="WORKFLOW", object_id="my_workflow_id", other={"name": "my_workflow"}), + LineageAtom(object_type="TASK", object_id="my_workflow_id/my_task_id"), + LineageAtom(object_type="NOTEBOOK", object_id="my_notebook_path"), + LineageAtom(object_type="FILE", object_id=str(workspace_file)), + ], + assessment_start_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=5.0), + assessment_end_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=2.0), + ), + UsedTable( + catalog_name="catalog", # This table is migrated + schema_name="staff_db", + table_name="employees", + is_read=False, + is_write=True, + source_id=str(make_workspace_file()), + source_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(hours=2.0), + source_lineage=[ + LineageAtom(object_type="WORKFLOW", object_id="my_workflow_id", other={"name": "my_workflow"}), + LineageAtom(object_type="TASK", object_id="my_workflow_id/my_task_id"), + LineageAtom(object_type="NOTEBOOK", object_id="my_notebook_path"), + LineageAtom(object_type="FILE", object_id="my file_path"), + ], + assessment_start_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=5.0), + assessment_end_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=2.0), + ), + ] + return records + + @pytest.fixture def catalog_populated( # pylint: disable=too-many-arguments runtime_ctx: MockRuntimeContext, @@ -201,6 +308,9 @@ def catalog_populated( # pylint: disable=too-many-arguments clusters: list[ClusterInfo], pipelines: list[PipelineInfo], policies: list[PolicyInfo], + query_problems: list[QueryProblem], + dfsas: list[DirectFsAccess], + used_tables: list[UsedTable], ): """Populate the UCX catalog with multiworkspace tables. 
@@ -235,6 +345,13 @@ def catalog_populated( # pylint: disable=too-many-arguments Grant, mode='overwrite', ) + # Persist UsedTable to match when looking for UsedTable ownership to tables + runtime_ctx.sql_backend.save_table( + f'hive_metastore.{runtime_ctx.inventory_database}.used_tables_in_paths', + used_tables, + UsedTable, + mode='overwrite', + ) for parent_run_id in range(1, 3): # No changes in progress between the two runs runtime_ctx = runtime_ctx.replace(parent_run_id=parent_run_id) runtime_ctx.tables_progress.append_inventory_snapshot(tables) @@ -252,6 +369,12 @@ def catalog_populated( # pylint: disable=too-many-arguments del runtime_ctx.pipelines_progress runtime_ctx.policies_progress.append_inventory_snapshot(policies) del runtime_ctx.policies_progress + runtime_ctx.query_problem_progress.append_inventory_snapshot(query_problems) + del runtime_ctx.query_problem_progress + runtime_ctx.direct_filesystem_access_progress.append_inventory_snapshot(dfsas) + del runtime_ctx.direct_filesystem_access_progress + runtime_ctx.used_table_progress.append_inventory_snapshot(used_tables) + del runtime_ctx.used_table_progress return runtime_ctx.ucx_catalog @@ -290,22 +413,30 @@ def test_migration_progress_dashboard( @pytest.mark.parametrize( "query_name, rows", [ - ("01_0_percentage_migration_progress", [Row(percentage=round(100 * 22 / 34, 2))]), - ("01_1_percentage_udf_migration_progress", [Row(percentage=round(100 * 1 / 2, 2))]), - ("01_2_percentage_grant_migration_progress", [Row(percentage=round(100 * 12 / 13, 2))]), - ("01_3_percentage_job_migration_progress", [Row(percentage=round(100 * 1 / 3, 2))]), - ("01_4_percentage_cluster_migration_progress", [Row(percentage=round(100 * 1 / 2, 2))]), - ("01_5_percentage_table_migration_progress", [Row(percentage=round(100 * 5 / 10, 2))]), - ("01_6_percentage_pipeline_migration_progress", [Row(percentage=round(100 * 1 / 2, 2))]), - ("01_7_percentage_policy_migration_progress", [Row(percentage=round(100 * 1 / 2, 2))]), + ("01_00_percentage_migration_progress", [Row(percentage=round(100 * 23 / 39, 2))]), + ("01_01_percentage_udf_migration_progress", [Row(percentage=round(100 * 1 / 2, 2))]), + ("01_02_percentage_grant_migration_progress", [Row(percentage=round(100 * 12 / 13, 2))]), + ("01_03_percentage_job_migration_progress", [Row(percentage=round(100 * 1 / 3, 2))]), + ("01_04_percentage_cluster_migration_progress", [Row(percentage=round(100 * 1 / 2, 2))]), + ("01_05_percentage_table_migration_progress", [Row(percentage=round(100 * 5 / 10, 2))]), + ("01_06_percentage_used_table_progress", [Row(percentage=round(100 * 1 / 2, 2))]), + ("01_07_count_direct_filesystem_access", [Row(counter=2)]), + ("01_08_count_query_problem", [Row(counter=1)]), + ("01_09_percentage_pipeline_migration_progress", [Row(percentage=round(100 * 1 / 2, 2))]), + ("01_10_percentage_policy_migration_progress", [Row(percentage=round(100 * 1 / 2, 2))]), ( - "01_8_distinct_failures_per_object_type", + "01_11_distinct_failures_per_object_type", [ Row( object_type="ClusterInfo", count=1, failure="Uses azure service principal credentials config in cluster", ), + Row( + object_type="DirectFsAccess", + count=2, + failure="Direct filesystem access is not supported in Unity Catalog", + ), Row( object_type="Grant", count=1, @@ -327,8 +458,10 @@ def test_migration_progress_dashboard( count=1, failure="Uses azure service principal credentials config in policy", ), + Row(object_type="QueryProblem", count=1, failure="[sql-parse-error] Could not parse SQL"), Row(object_type="Table", count=5, 
failure="Pending migration"), Row(object_type="Udf", count=1, failure="UDF not supported by UC"), + Row(object_type="UsedTable", count=1, failure="Pending migration"), ], ), ( @@ -351,9 +484,21 @@ def test_migration_progress_dashboard( Row(owner="Eric", percentage=round(100 * 1 / 1, 2), total=1, total_migrated=1, total_not_migrated=0), ], ), + ( + "03_01_pending_migration_data_asset_references", + [ + Row(count=3), + ], + ), + ( + "03_03_migrated_data_asset_references", + [ + Row(count=1), + ], + ), ], ) -def test_percentage_migration_progress( +def test_migration_progress_query( dashboard_metadata: DashboardMetadata, sql_backend: SqlBackend, query_name, @@ -363,3 +508,121 @@ def test_percentage_migration_progress( assert len(datasets) == 1, f"Missing query: {query_name}" query_results = list(sql_backend.fetch(datasets[0].query)) assert query_results == rows + + +def test_migration_progress_query_data_asset_references_by_owner_bar_graph( + ws: WorkspaceClient, + dashboard_metadata: DashboardMetadata, + sql_backend: SqlBackend, +) -> None: + """Separate test is required to set the owner of the used table at runtime""" + query_name = "03_02_data_asset_references_by_owner_bar_graph" + rows = [Row(owner=ws.current_user.me().user_name, count=1)] + datasets = [d for d in dashboard_metadata.get_datasets() if d.name == query_name] + assert len(datasets) == 1, f"Missing query: {query_name}" + query_results = list(sql_backend.fetch(datasets[0].query)) + assert query_results == rows + + +def test_migration_progress_query_data_asset_references_pending_migration_overview( + ws: WorkspaceClient, + dashboard_metadata: DashboardMetadata, + sql_backend: SqlBackend, +) -> None: + """Separate test is required to set the owner of the used table at runtime""" + query_name = "03_04_data_asset_references_pending_migration_overview" + current_user = ws.current_user.me().user_name + rows = [ + Row( + owner=current_user, + object_type="Direct filesystem access", + percentage=0, + total=2, + total_migrated=0, + total_not_migrated=2, + ), + Row( + owner=current_user, + object_type="Table or view reference", + percentage=50, + total=2, + total_migrated=1, + total_not_migrated=1, + ), + ] + datasets = [d for d in dashboard_metadata.get_datasets() if d.name == query_name] + assert len(datasets) == 1, f"Missing query: {query_name}" + query_results = list(sql_backend.fetch(datasets[0].query)) + assert query_results == rows + + +def test_migration_progress_query_data_asset_references_pending_migration( + ws: WorkspaceClient, + dashboard_metadata: DashboardMetadata, + sql_backend: SqlBackend, + dfsas: list[DirectFsAccess], + used_tables: list[UsedTable], +) -> None: + """Separate test is required to set the dfsas and used table dynamically""" + query_name = "03_05_data_asset_references_pending_migration" + workspace_id = ws.get_workspace_id() + current_user = ws.current_user.me().user_name + rows = [] + for dfsa in dfsas: + link_prefix = "/sql/editor/" if dfsa.source_type == "QUERY" else "/#workspace" + row = Row( + workspace_id=workspace_id, + owner=current_user, + object_type="Direct filesystem access", + object_id=dfsas[0].path, + failure="Direct filesystem access is not supported in Unity Catalog", + is_read=False, + is_write=True, + link=f"{link_prefix}{dfsa.source_id}", + ) + rows.append(row) + for used_table in used_tables: + if used_table.catalog_name != "hive_metastore": + continue + row = Row( + workspace_id=workspace_id, + owner=current_user, + object_type="Table or view reference", + 
object_id=f"{used_table.catalog_name}.{used_table.schema_name}.{used_table.table_name}", + failure="Pending migration", + is_read=False, + is_write=True, + link=f"/#workspace{used_table.source_id}", + ) + rows.append(row) + datasets = [d for d in dashboard_metadata.get_datasets() if d.name == query_name] + assert len(datasets) == 1, f"Missing query: {query_name}" + query_results = list(sql_backend.fetch(datasets[0].query)) + assert query_results == rows + + +def test_migration_progress_code_compatibility_issues( + ws: WorkspaceClient, + dashboard_metadata: DashboardMetadata, + sql_backend: SqlBackend, + query_problems: list[QueryProblem], +) -> None: + """Separate test is required to set the dashboard and query id dynamically""" + query_name = "03_06_code_compatibility_issues" + workspace_id = ws.get_workspace_id() + rows = [] + for query_problem in query_problems: + row = Row( + workspace_id=workspace_id, + code="sql-parse-error", + message="Could not parse SQL", + dashboard_name=query_problem.dashboard_name, + query_name=query_problem.query_name, + dashboard_id=query_problem.dashboard_id, + query_id=query_problem.query_id, + ) + rows.append(row) + datasets = [d for d in dashboard_metadata.get_datasets() if d.name == query_name] + assert len(datasets) == 1, f"Missing query: {query_name}" + query_results = list(sql_backend.fetch(datasets[0].query)) + assert query_results == rows From 91672a3c724b9ce46827bf65a306b47d5f832903 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 17 Dec 2024 13:23:15 +0100 Subject: [PATCH 050/129] Add dashboard fixture --- tests/integration/queries/test_migration_progress.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index 10391c7936..0f696b7fec 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -8,6 +8,7 @@ from databricks.labs.lsql.dashboards import DashboardMetadata, Dashboards from databricks.labs.ucx.assessment.clusters import ClusterInfo, PolicyInfo +from databricks.labs.ucx.assessment.dashboards import Dashboard from databricks.labs.ucx.assessment.jobs import JobInfo from databricks.labs.ucx.assessment.pipelines import PipelineInfo from databricks.labs.ucx.hive_metastore.grants import Grant @@ -191,6 +192,14 @@ def policies() -> list[PolicyInfo]: return records +@pytest.fixture +def dashboards(make_dashboard) -> list[Dashboard]: + dashboards = [ + Dashboard.from_sdk_redash_dashboard(make_dashboard()), + ] + return dashboards + + @pytest.fixture def query_problems(make_dashboard, make_query) -> list[QueryProblem]: dashboard, query = make_dashboard(), make_query() From 474c48582713a1553cb39e72311c45d7560c7418 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 17 Dec 2024 13:24:13 +0100 Subject: [PATCH 051/129] Append dashboard inventory snapshot --- tests/integration/queries/test_migration_progress.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index 0f696b7fec..22b3484064 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -320,6 +320,7 @@ def catalog_populated( # pylint: disable=too-many-arguments query_problems: list[QueryProblem], dfsas: list[DirectFsAccess], used_tables: list[UsedTable], + dashboards: list[Dashboard] ): """Populate the UCX catalog with 
multiworkspace tables. @@ -384,6 +385,8 @@ def catalog_populated( # pylint: disable=too-many-arguments del runtime_ctx.direct_filesystem_access_progress runtime_ctx.used_table_progress.append_inventory_snapshot(used_tables) del runtime_ctx.used_table_progress + runtime_ctx.dashboards_progress.append_inventory_snapshot(dashboards) + del runtime_ctx.dashboards_progress return runtime_ctx.ucx_catalog From 412daeba508448e43ac99470a418acb4657c6440 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 17 Dec 2024 13:25:14 +0100 Subject: [PATCH 052/129] Revert storing UsedTable, QueryProblem and DFSA snapshots --- tests/integration/queries/test_migration_progress.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index 22b3484064..73cda603c0 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -317,8 +317,6 @@ def catalog_populated( # pylint: disable=too-many-arguments clusters: list[ClusterInfo], pipelines: list[PipelineInfo], policies: list[PolicyInfo], - query_problems: list[QueryProblem], - dfsas: list[DirectFsAccess], used_tables: list[UsedTable], dashboards: list[Dashboard] ): @@ -379,12 +377,6 @@ def catalog_populated( # pylint: disable=too-many-arguments del runtime_ctx.pipelines_progress runtime_ctx.policies_progress.append_inventory_snapshot(policies) del runtime_ctx.policies_progress - runtime_ctx.query_problem_progress.append_inventory_snapshot(query_problems) - del runtime_ctx.query_problem_progress - runtime_ctx.direct_filesystem_access_progress.append_inventory_snapshot(dfsas) - del runtime_ctx.direct_filesystem_access_progress - runtime_ctx.used_table_progress.append_inventory_snapshot(used_tables) - del runtime_ctx.used_table_progress runtime_ctx.dashboards_progress.append_inventory_snapshot(dashboards) del runtime_ctx.dashboards_progress return runtime_ctx.ucx_catalog From 4a074fd822b42c74cffc531c22bc0e2b5e3735b5 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 17 Dec 2024 13:29:32 +0100 Subject: [PATCH 053/129] Store QueryProblem into table --- tests/integration/queries/test_migration_progress.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index 73cda603c0..7014402c20 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -318,6 +318,7 @@ def catalog_populated( # pylint: disable=too-many-arguments pipelines: list[PipelineInfo], policies: list[PolicyInfo], used_tables: list[UsedTable], + query_problems: list[QueryProblem], dashboards: list[Dashboard] ): """Populate the UCX catalog with multiworkspace tables. 
@@ -360,6 +361,13 @@ def catalog_populated( # pylint: disable=too-many-arguments UsedTable, mode='overwrite', ) + # Persists QueryProblems to propagate them to Dashboards + runtime_ctx.sql_backend.save_table( + f'hive_metastore.{runtime_ctx.inventory_database}.query_problems', + query_problems, + QueryProblem, + mode='overwrite', + ) for parent_run_id in range(1, 3): # No changes in progress between the two runs runtime_ctx = runtime_ctx.replace(parent_run_id=parent_run_id) runtime_ctx.tables_progress.append_inventory_snapshot(tables) From 93123d8c2d2d0d0255128e6fbc8373ece9d2b4b2 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 17 Dec 2024 13:35:27 +0100 Subject: [PATCH 054/129] Add TODO --- tests/integration/queries/test_migration_progress.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index 7014402c20..001d04e784 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -220,6 +220,7 @@ def query_problems(make_dashboard, make_query) -> list[QueryProblem]: @pytest.fixture def dfsas(make_workspace_file, make_query) -> list[DirectFsAccess]: + # TODO: Match the DFSAs with a job and dashboard workspace_file = make_workspace_file(content='df = spark.read.csv("dbfs://folder/file.csv")') query = make_query(sql_query="SELECT * FROM csv.`dbfs://folder/file.csv`") records = [ From 397287603fd713d372954eabb3ad4432f215a087 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 17 Dec 2024 13:52:24 +0100 Subject: [PATCH 055/129] Remove DFSA, QueryProblem and UsedTable queries --- .../main/01_00_percentage_migration_progress.sql | 2 +- ...6_percentage_pipeline_migration_progress.sql} | 0 .../01_06_percentage_used_table_progress.sql | 5 ----- .../01_07_count_direct_filesystem_access.sql | 7 ------- ..._07_percentage_policy_migration_progress.sql} | 0 .../progress/main/01_08_count_query_problem.sql | 6 ------ ... 
01_08_distinct_failures_per_object_type.sql} | 2 +- .../queries/test_migration_progress.py | 16 +++------------- 8 files changed, 5 insertions(+), 33 deletions(-) rename src/databricks/labs/ucx/queries/progress/main/{01_09_percentage_pipeline_migration_progress.sql => 01_06_percentage_pipeline_migration_progress.sql} (100%) delete mode 100644 src/databricks/labs/ucx/queries/progress/main/01_06_percentage_used_table_progress.sql delete mode 100644 src/databricks/labs/ucx/queries/progress/main/01_07_count_direct_filesystem_access.sql rename src/databricks/labs/ucx/queries/progress/main/{01_10_percentage_policy_migration_progress.sql => 01_07_percentage_policy_migration_progress.sql} (100%) delete mode 100644 src/databricks/labs/ucx/queries/progress/main/01_08_count_query_problem.sql rename src/databricks/labs/ucx/queries/progress/main/{01_11_distinct_failures_per_object_type.sql => 01_08_distinct_failures_per_object_type.sql} (67%) diff --git a/src/databricks/labs/ucx/queries/progress/main/01_00_percentage_migration_progress.sql b/src/databricks/labs/ucx/queries/progress/main/01_00_percentage_migration_progress.sql index d5ca534978..d429af42cc 100644 --- a/src/databricks/labs/ucx/queries/progress/main/01_00_percentage_migration_progress.sql +++ b/src/databricks/labs/ucx/queries/progress/main/01_00_percentage_migration_progress.sql @@ -2,4 +2,4 @@ SELECT ROUND(100 * try_divide(COUNT_IF(SIZE(failures) = 0), COUNT(*)), 2) AS percentage FROM ucx_catalog.multiworkspace.objects_snapshot -WHERE object_type IN ('ClusterInfo', 'DirectFsAccess', 'Grant', 'JobInfo', 'PipelineInfo', 'PolicyInfo', 'QueryProblem', 'Table', 'Udf', 'UsedTable') +WHERE object_type IN ('ClusterInfo', 'Grant', 'JobInfo', 'PipelineInfo', 'PolicyInfo', 'Table', 'Udf') diff --git a/src/databricks/labs/ucx/queries/progress/main/01_09_percentage_pipeline_migration_progress.sql b/src/databricks/labs/ucx/queries/progress/main/01_06_percentage_pipeline_migration_progress.sql similarity index 100% rename from src/databricks/labs/ucx/queries/progress/main/01_09_percentage_pipeline_migration_progress.sql rename to src/databricks/labs/ucx/queries/progress/main/01_06_percentage_pipeline_migration_progress.sql diff --git a/src/databricks/labs/ucx/queries/progress/main/01_06_percentage_used_table_progress.sql b/src/databricks/labs/ucx/queries/progress/main/01_06_percentage_used_table_progress.sql deleted file mode 100644 index 544062edc2..0000000000 --- a/src/databricks/labs/ucx/queries/progress/main/01_06_percentage_used_table_progress.sql +++ /dev/null @@ -1,5 +0,0 @@ -/* --title '"Table references in code" progress (%)' --description 'Tables referring UC over Hive metastore' */ -SELECT - ROUND(100 * TRY_DIVIDE(COUNT_IF(SIZE(failures) = 0), COUNT(*)), 2) AS percentage -FROM ucx_catalog.multiworkspace.objects_snapshot -WHERE object_type = "UsedTable" diff --git a/src/databricks/labs/ucx/queries/progress/main/01_07_count_direct_filesystem_access.sql b/src/databricks/labs/ucx/queries/progress/main/01_07_count_direct_filesystem_access.sql deleted file mode 100644 index 2a79b7d902..0000000000 --- a/src/databricks/labs/ucx/queries/progress/main/01_07_count_direct_filesystem_access.sql +++ /dev/null @@ -1,7 +0,0 @@ -/* --title 'Direct filesystem access progress (#)' --description 'Unsupported in Unity Catalog' */ -SELECT COUNT(*) AS counter -FROM ucx_catalog.multiworkspace.objects_snapshot -WHERE object_type = "DirectFsAccess" - -- Redundant filter as a direct filesystem access is a failure by definition (see description above), - -- however, 
filter is defined for explicitness and as this knowledge is not "known" to this query. - AND SIZE(failures) > 0 diff --git a/src/databricks/labs/ucx/queries/progress/main/01_10_percentage_policy_migration_progress.sql b/src/databricks/labs/ucx/queries/progress/main/01_07_percentage_policy_migration_progress.sql similarity index 100% rename from src/databricks/labs/ucx/queries/progress/main/01_10_percentage_policy_migration_progress.sql rename to src/databricks/labs/ucx/queries/progress/main/01_07_percentage_policy_migration_progress.sql diff --git a/src/databricks/labs/ucx/queries/progress/main/01_08_count_query_problem.sql b/src/databricks/labs/ucx/queries/progress/main/01_08_count_query_problem.sql deleted file mode 100644 index a70028dc6b..0000000000 --- a/src/databricks/labs/ucx/queries/progress/main/01_08_count_query_problem.sql +++ /dev/null @@ -1,6 +0,0 @@ -/* --title 'Query problem progress (#)' */ -SELECT COUNT(*) AS counter -FROM ucx_catalog.multiworkspace.objects_snapshot -WHERE object_type = "QueryProblem" - -- Redundant filter as a query problem is a failure by definition, however, filter is defined for explicitness - AND SIZE(failures) > 0 diff --git a/src/databricks/labs/ucx/queries/progress/main/01_11_distinct_failures_per_object_type.sql b/src/databricks/labs/ucx/queries/progress/main/01_08_distinct_failures_per_object_type.sql similarity index 67% rename from src/databricks/labs/ucx/queries/progress/main/01_11_distinct_failures_per_object_type.sql rename to src/databricks/labs/ucx/queries/progress/main/01_08_distinct_failures_per_object_type.sql index 75cb3bcaf6..00a229d02f 100644 --- a/src/databricks/labs/ucx/queries/progress/main/01_11_distinct_failures_per_object_type.sql +++ b/src/databricks/labs/ucx/queries/progress/main/01_08_distinct_failures_per_object_type.sql @@ -2,7 +2,7 @@ with failures AS ( SELECT object_type, explode(failures) AS failure FROM ucx_catalog.multiworkspace.objects_snapshot - WHERE object_type IN ('ClusterInfo', 'DirectFsAccess', 'Grant', 'JobInfo', 'PipelineInfo', 'PolicyInfo', 'QueryProblem', 'Table', 'Udf', 'UsedTable') + WHERE object_type IN ('ClusterInfo', 'Grant', 'JobInfo', 'PipelineInfo', 'PolicyInfo', 'Table', 'Udf') ) SELECT diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index 001d04e784..1136cb332d 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -432,24 +432,16 @@ def test_migration_progress_dashboard( ("01_03_percentage_job_migration_progress", [Row(percentage=round(100 * 1 / 3, 2))]), ("01_04_percentage_cluster_migration_progress", [Row(percentage=round(100 * 1 / 2, 2))]), ("01_05_percentage_table_migration_progress", [Row(percentage=round(100 * 5 / 10, 2))]), - ("01_06_percentage_used_table_progress", [Row(percentage=round(100 * 1 / 2, 2))]), - ("01_07_count_direct_filesystem_access", [Row(counter=2)]), - ("01_08_count_query_problem", [Row(counter=1)]), - ("01_09_percentage_pipeline_migration_progress", [Row(percentage=round(100 * 1 / 2, 2))]), - ("01_10_percentage_policy_migration_progress", [Row(percentage=round(100 * 1 / 2, 2))]), + ("01_06_percentage_pipeline_migration_progress", [Row(percentage=round(100 * 1 / 2, 2))]), + ("01_07_percentage_policy_migration_progress", [Row(percentage=round(100 * 1 / 2, 2))]), ( - "01_11_distinct_failures_per_object_type", + "01_08_distinct_failures_per_object_type", [ Row( object_type="ClusterInfo", count=1, failure="Uses azure service 
principal credentials config in cluster", ), - Row( - object_type="DirectFsAccess", - count=2, - failure="Direct filesystem access is not supported in Unity Catalog", - ), Row( object_type="Grant", count=1, @@ -471,10 +463,8 @@ count=1, failure="Uses azure service principal credentials config in policy", ), - Row(object_type="QueryProblem", count=1, failure="[sql-parse-error] Could not parse SQL"), Row(object_type="Table", count=5, failure="Pending migration"), Row(object_type="Udf", count=1, failure="UDF not supported by UC"), - Row(object_type="UsedTable", count=1, failure="Pending migration"), ], ), ( From af2211fc18e0814ed01eb69753608231ec7cca4e Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 17 Dec 2024 13:56:14 +0100 Subject: [PATCH 056/129] Rename code section to Dashboards --- .../labs/ucx/queries/progress/main/03_00_code.md | 8 -------- .../labs/ucx/queries/progress/main/03_00_dashboards.md | 8 ++++++++ 2 files changed, 8 insertions(+), 8 deletions(-) delete mode 100644 src/databricks/labs/ucx/queries/progress/main/03_00_code.md create mode 100644 src/databricks/labs/ucx/queries/progress/main/03_00_dashboards.md diff --git a/src/databricks/labs/ucx/queries/progress/main/03_00_code.md b/src/databricks/labs/ucx/queries/progress/main/03_00_code.md deleted file mode 100644 index ca3fd81e2a..0000000000 --- a/src/databricks/labs/ucx/queries/progress/main/03_00_code.md +++ /dev/null @@ -1,8 +0,0 @@ -# Code - -This section shows Unity Catalog compatibility issues found while linting code. There are two kinds of code changes to -perform: -- Data asset reference, like references to Hive metastore tables and views or direct filesystem access (dfsa). These - references should be updated to refer to their Unity Catalog counterparts. -- Linting compatibility issues, like using RDDs or directly accessing the Spark context. These issues should be resolved - by following the instructions stated with the issue. diff --git a/src/databricks/labs/ucx/queries/progress/main/03_00_dashboards.md b/src/databricks/labs/ucx/queries/progress/main/03_00_dashboards.md new file mode 100644 index 0000000000..4aa7f5a8fd --- /dev/null +++ b/src/databricks/labs/ucx/queries/progress/main/03_00_dashboards.md @@ -0,0 +1,8 @@ +# Dashboards + +This section shows Unity Catalog compatibility issues found while linting dashboards. There are two kinds of changes to +perform: +- Data asset reference, i.e. references to Hive metastore tables and views or direct filesystem access (dfsa); these + references should be updated to refer to their Unity Catalog counterparts. +- Linting compatibility issues, e.g. using RDDs or directly accessing the Spark context; these issues should be resolved + by following the instructions stated with the issue.
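To make the first kind of change listed above concrete, here is a minimal before-and-after sketch of a dashboard query; the dbfs path and table names mirror the hypothetical placeholders used in the test fixtures, not real assets:

    -- Before: a direct filesystem access and a Hive metastore reference, both flagged as failures
    SELECT * FROM csv.`dbfs://folder/file.csv`;
    SELECT * FROM hive_metastore.schema.table;

    -- After: the Unity Catalog counterparts, a UC volume path and a three-level UC table name (assumed targets)
    SELECT * FROM csv.`/Volumes/catalog/schema/volume/file.csv`;
    SELECT * FROM catalog.schema.table;

The second kind, linting compatibility issues, has no mechanical rewrite; the failure message carries the instruction to follow.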
From 11b21387ec5a916998c23e0099d1773779195d6c Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 17 Dec 2024 13:58:00 +0100 Subject: [PATCH 057/129] Update data asset to dashboard references --- .../03_01_dashboards_pending_migration.sql | 4 ++ ...ending_migration_data_asset_references.sql | 4 -- ..._pending_migration_by_owner_bar_graph.sql} | 5 +-- .../main/03_03_dashboards_migrated.sql | 4 ++ .../03_03_migrated_data_asset_references.sql | 4 -- ...s_pending_migration_by_owner_overview.sql} | 11 ++--- .../03_05_dashboards_pending_migration.sql | 35 +++++++++++++++ ...ata_asset_references_pending_migration.sql | 43 ------------------- .../queries/test_migration_progress.py | 8 ++-- 9 files changed, 52 insertions(+), 66 deletions(-) create mode 100644 src/databricks/labs/ucx/queries/progress/main/03_01_dashboards_pending_migration.sql delete mode 100644 src/databricks/labs/ucx/queries/progress/main/03_01_pending_migration_data_asset_references.sql rename src/databricks/labs/ucx/queries/progress/main/{03_02_data_asset_references_by_owner_bar_graph.sql => 03_02_dashboards_pending_migration_by_owner_bar_graph.sql} (75%) create mode 100644 src/databricks/labs/ucx/queries/progress/main/03_03_dashboards_migrated.sql delete mode 100644 src/databricks/labs/ucx/queries/progress/main/03_03_migrated_data_asset_references.sql rename src/databricks/labs/ucx/queries/progress/main/{03_04_data_asset_references_pending_migration_overview.sql => 03_04_dashboards_pending_migration_by_owner_overview.sql} (53%) create mode 100644 src/databricks/labs/ucx/queries/progress/main/03_05_dashboards_pending_migration.sql delete mode 100644 src/databricks/labs/ucx/queries/progress/main/03_05_data_asset_references_pending_migration.sql diff --git a/src/databricks/labs/ucx/queries/progress/main/03_01_dashboards_pending_migration.sql b/src/databricks/labs/ucx/queries/progress/main/03_01_dashboards_pending_migration.sql new file mode 100644 index 0000000000..72b55cc12a --- /dev/null +++ b/src/databricks/labs/ucx/queries/progress/main/03_01_dashboards_pending_migration.sql @@ -0,0 +1,4 @@ +/* --title 'Dashboards pending migration' --height 6 */ +SELECT COUNT(*) AS count +FROM ucx_catalog.multiworkspace.objects_snapshot +WHERE object_type = 'Dashboard' AND SIZE(failures) > 0 diff --git a/src/databricks/labs/ucx/queries/progress/main/03_01_pending_migration_data_asset_references.sql b/src/databricks/labs/ucx/queries/progress/main/03_01_pending_migration_data_asset_references.sql deleted file mode 100644 index d6388b41b3..0000000000 --- a/src/databricks/labs/ucx/queries/progress/main/03_01_pending_migration_data_asset_references.sql +++ /dev/null @@ -1,4 +0,0 @@ -/* --title 'Pending migration' --description 'Total number of table, view and dfsa references' --height 6 */ -SELECT COUNT(*) AS count -FROM ucx_catalog.multiworkspace.objects_snapshot -WHERE object_type IN ('DirectFsAccess', 'UsedTable') AND SIZE(failures) > 0 diff --git a/src/databricks/labs/ucx/queries/progress/main/03_02_data_asset_references_by_owner_bar_graph.sql b/src/databricks/labs/ucx/queries/progress/main/03_02_dashboards_pending_migration_by_owner_bar_graph.sql similarity index 75% rename from src/databricks/labs/ucx/queries/progress/main/03_02_data_asset_references_by_owner_bar_graph.sql rename to src/databricks/labs/ucx/queries/progress/main/03_02_dashboards_pending_migration_by_owner_bar_graph.sql index 3910fc0b06..03e73529ca 100644 --- a/src/databricks/labs/ucx/queries/progress/main/03_02_data_asset_references_by_owner_bar_graph.sql +++ 
b/src/databricks/labs/ucx/queries/progress/main/03_02_dashboards_pending_migration_by_owner_bar_graph.sql @@ -1,6 +1,5 @@ /* ---title 'Pending migration' ---description 'Tables, views and dfsa per owner' +--title 'Dashboards pending migration' --width 5 --overrides '{"spec": { "version": 3, @@ -14,7 +13,7 @@ WITH owners_with_failures AS ( SELECT owner FROM ucx_catalog.multiworkspace.objects_snapshot - WHERE object_type IN ('DirectFsAccess', 'UsedTable') AND SIZE(failures) > 0 + WHERE object_type = 'Dashboard' AND SIZE(failures) > 0 ) SELECT diff --git a/src/databricks/labs/ucx/queries/progress/main/03_03_dashboards_migrated.sql b/src/databricks/labs/ucx/queries/progress/main/03_03_dashboards_migrated.sql new file mode 100644 index 0000000000..f6ff935179 --- /dev/null +++ b/src/databricks/labs/ucx/queries/progress/main/03_03_dashboards_migrated.sql @@ -0,0 +1,4 @@ +/* --title 'Dashboards migrated' --height 6 */ +SELECT COUNT(*) AS count +FROM ucx_catalog.multiworkspace.objects_snapshot +WHERE object_type = 'Dashboard' AND SIZE(failures) == 0 diff --git a/src/databricks/labs/ucx/queries/progress/main/03_03_migrated_data_asset_references.sql b/src/databricks/labs/ucx/queries/progress/main/03_03_migrated_data_asset_references.sql deleted file mode 100644 index 689e2bfaf0..0000000000 --- a/src/databricks/labs/ucx/queries/progress/main/03_03_migrated_data_asset_references.sql +++ /dev/null @@ -1,4 +0,0 @@ -/* --title 'Migrated' --description 'Total number of table, view and dfsa references' --height 6 */ -SELECT COUNT(*) AS count -FROM ucx_catalog.multiworkspace.objects_snapshot -WHERE object_type IN ('DirectFsAccess', 'UsedTable') AND SIZE(failures) == 0 diff --git a/src/databricks/labs/ucx/queries/progress/main/03_04_data_asset_references_pending_migration_overview.sql b/src/databricks/labs/ucx/queries/progress/main/03_04_dashboards_pending_migration_by_owner_overview.sql similarity index 53% rename from src/databricks/labs/ucx/queries/progress/main/03_04_data_asset_references_pending_migration_overview.sql rename to src/databricks/labs/ucx/queries/progress/main/03_04_dashboards_pending_migration_by_owner_overview.sql index 1b14d7185b..82b4e0ba9e 100644 --- a/src/databricks/labs/ucx/queries/progress/main/03_04_data_asset_references_pending_migration_overview.sql +++ b/src/databricks/labs/ucx/queries/progress/main/03_04_dashboards_pending_migration_by_owner_overview.sql @@ -1,20 +1,15 @@ -/* --title 'Overview' --description 'Table, view and dfsa migration' --width 5 */ +/* --title 'Dashboard pending migration' --width 5 */ WITH migration_statuses AS ( SELECT owner, object_type, failures FROM ucx_catalog.multiworkspace.objects_snapshot - WHERE object_type IN ('DirectFsAccess', 'UsedTable') + WHERE object_type = 'Dashboard' ) SELECT owner, - CASE - WHEN object_type = 'DirectFsAccess' THEN 'Direct filesystem access' - WHEN object_type = 'UsedTable' THEN 'Table or view reference' - ELSE object_type - END AS object_type, DOUBLE(CEIL(100 * COUNT_IF(SIZE(failures) = 0) / SUM(COUNT(*)) OVER (PARTITION BY owner, object_type), 2)) AS percentage, COUNT(*) AS total, COUNT_IF(SIZE(failures) = 0) AS total_migrated, COUNT_IF(SIZE(failures) > 0) AS total_not_migrated FROM migration_statuses -GROUP BY owner, object_type +GROUP BY owner diff --git a/src/databricks/labs/ucx/queries/progress/main/03_05_dashboards_pending_migration.sql b/src/databricks/labs/ucx/queries/progress/main/03_05_dashboards_pending_migration.sql new file mode 100644 index 0000000000..9350d121b5 --- /dev/null +++ 
b/src/databricks/labs/ucx/queries/progress/main/03_05_dashboards_pending_migration.sql @@ -0,0 +1,35 @@ +/* +--title 'Dashboards pending migration' +--width 6 +--overrides '{"spec":{ + "encodings":{ + "columns": [ + {"fieldName": "workspace_id", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "workspace_id"}, + {"fieldName": "owner", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "owner"}, + {"fieldName": "name", "title": "Name", "type": "string", "displayAs": "link", "linkUrlTemplate": "{{ dashboard_link }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "booleanValues": ["false", "true"]}, + {"fieldName": "failure", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "failure"}, + {"fieldName": "dashboard_type", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "dashboard_type"} + ]}, + "invisibleColumns": [ + {"fieldName": "dashboard_link", "title": "dashboard_link", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]} + ] + }}' +*/ +SELECT + workspace_id, + owner, + data.name AS name, + EXPLODE(failures) AS failure, + CASE + -- Simple heuristic to differentiate between Redash and Lakeview dashboards + WHEN '-' IN data.id THEN 'Redash' + ELSE 'Lakeview' + END AS dashboard_type, + -- Below are invisible column(s) used in links url templates + CASE + WHEN '-' IN data.id THEN CONCAT('/sql/dashboards/', data.id) + ELSE CONCAT('/dashboardsv3/', data.id, '/published') + END AS dashboard_link +FROM ucx_catalog.multiworkspace.objects_snapshot +WHERE object_type = 'Dashboard' AND SIZE(failures) > 0 +ORDER BY workspace_id, owner, name diff --git a/src/databricks/labs/ucx/queries/progress/main/03_05_data_asset_references_pending_migration.sql b/src/databricks/labs/ucx/queries/progress/main/03_05_data_asset_references_pending_migration.sql deleted file mode 100644 index 0db6a1bc8c..0000000000 --- a/src/databricks/labs/ucx/queries/progress/main/03_05_data_asset_references_pending_migration.sql +++ /dev/null @@ -1,43 +0,0 @@ -/* ---title 'Data asset references' ---width 6 ---overrides '{"spec":{ - "encodings":{ - "columns": [ - {"fieldName": "workspace_id", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "workspace_id"}, - {"fieldName": "object_type", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "object_type"}, - {"fieldName": "object_id", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ link }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "object_id"}, - {"fieldName": "failure", "booleanValues": ["false", "true"], "type": "integer", "displayAs": "number", "title": "failure"}, - {"fieldName": "is_read", "booleanValues": ["false", "true"], "type": "integer", "displayAs": "number", "title": "is_read"}, - {"fieldName": "is_write", "booleanValues": ["false", "true"], "type": "integer", "displayAs": "number", "title": "is_write"} - ]}, - "invisibleColumns": [ - {"name": "link", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "link"} - ] - }}' -*/ -SELECT - workspace_id, - owner, - CASE - WHEN object_type = 'DirectFsAccess' THEN 'Direct filesystem access' - WHEN 
object_type = 'UsedTable' THEN 'Table or view reference' - ELSE object_type - END AS object_type, - CASE - WHEN object_type = 'DirectFsAccess' THEN data.path - WHEN object_type = 'UsedTable' THEN CONCAT_WS('.', object_id) - ELSE CONCAT_WS('.', object_id) - END AS object_id, - EXPLODE(failures) AS failure, - CAST(data.is_read AS BOOLEAN) AS is_read, - CAST(data.is_write AS BOOLEAN) AS is_write, - -- Below are invisible column(s) used in links url templates - CASE - -- SQL queries do NOT point to the workspace, i.e. start with '/' - WHEN object_type = 'DirectFsAccess' AND SUBSTRING(data.source_id, 0, 1) != '/' THEN CONCAT('/sql/editor/', data.source_id) - ELSE CONCAT('/#workspace', data.source_id) - END AS link -FROM ucx_catalog.multiworkspace.objects_snapshot -ORDER BY workspace_id, owner, object_type, object_id -WHERE object_type IN ('DirectFsAccess', 'UsedTable') diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index 1136cb332d..59da98a7e9 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -320,7 +320,7 @@ def catalog_populated( # pylint: disable=too-many-arguments policies: list[PolicyInfo], used_tables: list[UsedTable], query_problems: list[QueryProblem], - dashboards: list[Dashboard] + dashboards: list[Dashboard], ): """Populate the UCX catalog with multiworkspace tables. @@ -488,13 +488,13 @@ def test_migration_progress_dashboard( ], ), ( - "03_01_pending_migration_data_asset_references", + "03_01_dashboards_pending_migration", [ Row(count=3), ], ), ( - "03_03_migrated_data_asset_references", + "03_03_dashboards_migrated", [ Row(count=1), ], @@ -519,7 +519,7 @@ def test_migration_progress_query_data_asset_references_by_owner_bar_graph( sql_backend: SqlBackend, ) -> None: """Separate test is required to set the owner of the used table at runtime""" - query_name = "03_02_data_asset_references_by_owner_bar_graph" + query_name = "03_02_dashboards_pending_migration_by_owner_bar_graph" rows = [Row(owner=ws.current_user.me().user_name, count=1)] datasets = [d for d in dashboard_metadata.get_datasets() if d.name == query_name] assert len(datasets) == 1, f"Missing query: {query_name}" From 8e625ff67fc4b857c136a253b8436c2a2838dca7 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 17 Dec 2024 14:17:13 +0100 Subject: [PATCH 058/129] Avoid redefinition --- tests/integration/queries/test_migration_progress.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index 59da98a7e9..c8bab14a74 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -194,10 +194,10 @@ def policies() -> list[PolicyInfo]: @pytest.fixture def dashboards(make_dashboard) -> list[Dashboard]: - dashboards = [ + records = [ Dashboard.from_sdk_redash_dashboard(make_dashboard()), ] - return dashboards + return records @pytest.fixture From 018c3cdd099dbfed9098669c24dc6020cd069a68 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 09:17:12 +0100 Subject: [PATCH 059/129] Fix partition by --- .../03_04_dashboards_pending_migration_by_owner_overview.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/queries/progress/main/03_04_dashboards_pending_migration_by_owner_overview.sql 
b/src/databricks/labs/ucx/queries/progress/main/03_04_dashboards_pending_migration_by_owner_overview.sql
index 82b4e0ba9e..58e4f013e6 100644
--- a/src/databricks/labs/ucx/queries/progress/main/03_04_dashboards_pending_migration_by_owner_overview.sql
+++ b/src/databricks/labs/ucx/queries/progress/main/03_04_dashboards_pending_migration_by_owner_overview.sql
@@ -1,13 +1,13 @@
 /* --title 'Dashboard pending migration' --width 5 */
 WITH migration_statuses AS (
-    SELECT owner, object_type, failures
+    SELECT owner, failures
     FROM ucx_catalog.multiworkspace.objects_snapshot
     WHERE object_type = 'Dashboard'
 )
 SELECT
     owner,
-    DOUBLE(CEIL(100 * COUNT_IF(SIZE(failures) = 0) / SUM(COUNT(*)) OVER (PARTITION BY owner, object_type), 2)) AS percentage,
+    DOUBLE(CEIL(100 * COUNT_IF(SIZE(failures) = 0) / SUM(COUNT(*)) OVER (PARTITION BY owner), 2)) AS percentage,
     COUNT(*) AS total,
     COUNT_IF(SIZE(failures) = 0) AS total_migrated,
     COUNT_IF(SIZE(failures) > 0) AS total_not_migrated

From 1d915f21678915790d1eda47ee8c2907724dff62 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 18 Dec 2024 09:20:31 +0100
Subject: [PATCH 060/129] Fix substring check

---
 .../progress/main/03_05_dashboards_pending_migration.sql | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/databricks/labs/ucx/queries/progress/main/03_05_dashboards_pending_migration.sql b/src/databricks/labs/ucx/queries/progress/main/03_05_dashboards_pending_migration.sql
index 9350d121b5..c2ae9ad3cf 100644
--- a/src/databricks/labs/ucx/queries/progress/main/03_05_dashboards_pending_migration.sql
+++ b/src/databricks/labs/ucx/queries/progress/main/03_05_dashboards_pending_migration.sql
@@ -22,12 +22,12 @@ SELECT
     EXPLODE(failures) AS failure,
     CASE
         -- Simple heuristic to differentiate between Redash and Lakeview dashboards
-        WHEN '-' IN data.id THEN 'Redash'
+        WHEN CONTAINS('-', data.id) THEN 'Redash'
         ELSE 'Lakeview'
     END AS dashboard_type,
     -- Below are invisible column(s) used in links url templates
     CASE
-        WHEN '-' IN data.id THEN CONCAT('/sql/dashboards/', data.id)
+        WHEN CONTAINS('-', data.id) THEN CONCAT('/sql/dashboards/', data.id)
         ELSE CONCAT('/dashboardsv3/', data.id, '/published')
     END AS dashboard_link
 FROM ucx_catalog.multiworkspace.objects_snapshot

From 9dd40b5302e6b5a372870a35c0a49958889648fe Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 18 Dec 2024 09:21:43 +0100
Subject: [PATCH 061/129] Fix name for code compatibility issues

---
 ...tibility_issues.sql => 03_06_query_compatibility_issues.sql} | 2 +-
 tests/integration/queries/test_migration_progress.py            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
 rename src/databricks/labs/ucx/queries/progress/main/{03_06_code_compatibility_issues.sql => 03_06_query_compatibility_issues.sql} (98%)

diff --git a/src/databricks/labs/ucx/queries/progress/main/03_06_code_compatibility_issues.sql b/src/databricks/labs/ucx/queries/progress/main/03_06_query_compatibility_issues.sql
similarity index 98%
rename from src/databricks/labs/ucx/queries/progress/main/03_06_code_compatibility_issues.sql
rename to src/databricks/labs/ucx/queries/progress/main/03_06_query_compatibility_issues.sql
index 1c623da8fd..76628b6ab5 100644
--- a/src/databricks/labs/ucx/queries/progress/main/03_06_code_compatibility_issues.sql
+++ b/src/databricks/labs/ucx/queries/progress/main/03_06_query_compatibility_issues.sql
@@ -1,5 +1,5 @@
 /*
---title 'Code compatability issues'
+--title 'Query compatability issues'
 --width 6
 --overrides '{"spec":{
     "encodings":{
diff --git 
a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index c8bab14a74..2781c08a00 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -611,7 +611,7 @@ def test_migration_progress_code_compatibility_issues( query_problems: list[QueryProblem], ) -> None: """Separate test is required to set the dashboard and query id dynamically""" - query_name = "03_06_code_compatibility_issues" + query_name = "03_06_query_compatibility_issues" workspace_id = ws.get_workspace_id() rows = [] for query_problem in query_problems: From 480ad8b8d5fe848cebf0cbbf2500509f4ccda11c Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 09:23:39 +0100 Subject: [PATCH 062/129] Create query for dashboard --- tests/integration/queries/test_migration_progress.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index 2781c08a00..565f5918b9 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -193,9 +193,10 @@ def policies() -> list[PolicyInfo]: @pytest.fixture -def dashboards(make_dashboard) -> list[Dashboard]: +def dashboards(make_dashboard, make_query) -> list[Dashboard]: + query = make_query() records = [ - Dashboard.from_sdk_redash_dashboard(make_dashboard()), + Dashboard.from_sdk_redash_dashboard(make_dashboard(query=query)), ] return records From 02ae46935cc89d08062e26710d2fbc612f81b392 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 09:25:30 +0100 Subject: [PATCH 063/129] Link query problems with dashboard --- .../queries/test_migration_progress.py | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index 565f5918b9..f4f9bbe4ab 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -194,24 +194,26 @@ def policies() -> list[PolicyInfo]: @pytest.fixture def dashboards(make_dashboard, make_query) -> list[Dashboard]: - query = make_query() + query_with_invalid_sql = make_query(sql_query="SELECT SUM(1") records = [ - Dashboard.from_sdk_redash_dashboard(make_dashboard(query=query)), + Dashboard.from_sdk_redash_dashboard(make_dashboard(query=query_with_invalid_sql)), ] return records @pytest.fixture -def query_problems(make_dashboard, make_query) -> list[QueryProblem]: - dashboard, query = make_dashboard(), make_query() +def query_problems(dashboards: list[Dashboard], ws: WorkspaceClient) -> list[QueryProblem]: + assert len(dashboards) == 1, "This fixtures expects one dashboard" + dashboard_with_invalid_sql, query_id_with_invalid_sql = dashboards[0], dashboards[0].query_ids[0] + query_with_invalid_sql = ws.queries.get(query_id_with_invalid_sql) records = [ QueryProblem( - dashboard.id, - dashboard.parent, - dashboard.name, - query.id, - query.parent, - query.name, + dashboard_with_invalid_sql.id, + dashboard_with_invalid_sql.parent, + dashboard_with_invalid_sql.name, + query_with_invalid_sql.id, + query_with_invalid_sql.parent_path, + query_with_invalid_sql.display_name, "sql-parse-error", "Could not parse SQL", ) From 77442e2f6b67a1fe9507b8714e116183aa4ce500 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 09:34:03 +0100 Subject: 
[PATCH 064/129] Swap arguments in contains --- .../progress/main/03_05_dashboards_pending_migration.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/queries/progress/main/03_05_dashboards_pending_migration.sql b/src/databricks/labs/ucx/queries/progress/main/03_05_dashboards_pending_migration.sql index c2ae9ad3cf..2d3a84a57e 100644 --- a/src/databricks/labs/ucx/queries/progress/main/03_05_dashboards_pending_migration.sql +++ b/src/databricks/labs/ucx/queries/progress/main/03_05_dashboards_pending_migration.sql @@ -22,12 +22,12 @@ SELECT EXPLODE(failures) AS failure, CASE -- Simple heuristic to differentiate between Redash and Lakeview dashboards - WHEN CONTAINS('-', data.id) THEN 'Redash' + WHEN CONTAINS(data.id, '-') THEN 'Redash' ELSE 'Lakeview' END AS dashboard_type, -- Below are invisible column(s) used in links url templates CASE - WHEN CONTAINS('-', data.id) THEN CONCAT('/sql/dashboards/', data.id) + WHEN CONTAINS(data.id, '-') THEN CONCAT('/sql/dashboards/', data.id) ELSE CONCAT('/dashboardsv3/', data.id, '/published') END AS dashboard_link FROM ucx_catalog.multiworkspace.objects_snapshot From a899f338985d2d577fc02fff0aa0baba6f8bae39 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 09:35:41 +0100 Subject: [PATCH 065/129] Swap failure and dashboard_type columns --- .../progress/main/03_05_dashboards_pending_migration.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/queries/progress/main/03_05_dashboards_pending_migration.sql b/src/databricks/labs/ucx/queries/progress/main/03_05_dashboards_pending_migration.sql index 2d3a84a57e..e22c63e876 100644 --- a/src/databricks/labs/ucx/queries/progress/main/03_05_dashboards_pending_migration.sql +++ b/src/databricks/labs/ucx/queries/progress/main/03_05_dashboards_pending_migration.sql @@ -19,12 +19,12 @@ SELECT workspace_id, owner, data.name AS name, - EXPLODE(failures) AS failure, CASE -- Simple heuristic to differentiate between Redash and Lakeview dashboards WHEN CONTAINS(data.id, '-') THEN 'Redash' ELSE 'Lakeview' END AS dashboard_type, + EXPLODE(failures) AS failure, -- Below are invisible column(s) used in links url templates CASE WHEN CONTAINS(data.id, '-') THEN CONCAT('/sql/dashboards/', data.id) From bfc4cc305fa0b5afe05bb6fabc08e9a92437cc75 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 09:36:54 +0100 Subject: [PATCH 066/129] Remove query problems widget --- .../main/03_06_query_compatibility_issues.sql | 29 ------------------- .../queries/test_migration_progress.py | 27 ----------------- 2 files changed, 56 deletions(-) delete mode 100644 src/databricks/labs/ucx/queries/progress/main/03_06_query_compatibility_issues.sql diff --git a/src/databricks/labs/ucx/queries/progress/main/03_06_query_compatibility_issues.sql b/src/databricks/labs/ucx/queries/progress/main/03_06_query_compatibility_issues.sql deleted file mode 100644 index 76628b6ab5..0000000000 --- a/src/databricks/labs/ucx/queries/progress/main/03_06_query_compatibility_issues.sql +++ /dev/null @@ -1,29 +0,0 @@ -/* ---title 'Query compatability issues' ---width 6 ---overrides '{"spec":{ - "encodings":{ - "columns": [ - {"fieldName": "workspace_id", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "workspace_id"}, - {"fieldName": "code", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "code"}, - {"fieldName": "message", "booleanValues": ["false", "true"], "type": "string", 
"displayAs": "string", "title": "message"}, - {"fieldName": "dashboard_name", "booleanValues": ["false", "true"], "linkUrlTemplate": "/sql/dashboards/{{ dashboard_id }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "dashboard"}, - {"fieldName": "query_name", "booleanValues": ["false", "true"], "linkUrlTemplate": "/sql/editor/{{ query_id }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "query"} - ]}, - "invisibleColumns": [ - {"name": "dashboard_id", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "dashboard_id"}, - {"name": "query_id", "booleanValues": ["false", "true"], "linkUrlTemplate": "{{ @ }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "type": "string", "displayAs": "link", "title": "query_id"} - ] - }}' -*/ -SELECT - workspace_id, - data.code, - data.message, - data.dashboard_name, - data.query_name, - -- Below are invisible columns used in links url templates - data.dashboard_id, - data.query_id -FROM ucx_catalog.multiworkspace.objects_snapshot -WHERE object_type = 'QueryProblem' diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index f4f9bbe4ab..369e0df213 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -605,30 +605,3 @@ def test_migration_progress_query_data_asset_references_pending_migration( assert len(datasets) == 1, f"Missing query: {query_name}" query_results = list(sql_backend.fetch(datasets[0].query)) assert query_results == rows - - -def test_migration_progress_code_compatibility_issues( - ws: WorkspaceClient, - dashboard_metadata: DashboardMetadata, - sql_backend: SqlBackend, - query_problems: list[QueryProblem], -) -> None: - """Separate test is required to set the dashboard and query id dynamically""" - query_name = "03_06_query_compatibility_issues" - workspace_id = ws.get_workspace_id() - rows = [] - for query_problem in query_problems: - row = Row( - workspace_id=workspace_id, - code="sql-parse-error", - message="Could not parse SQL", - dashboard_name=query_problem.dashboard_name, - query_name=query_problem.query_name, - dashboard_id=query_problem.dashboard_id, - query_id=query_problem.query_id, - ) - rows.append(row) - datasets = [d for d in dashboard_metadata.get_datasets() if d.name == query_name] - assert len(datasets) == 1, f"Missing query: {query_name}" - query_results = list(sql_backend.fetch(datasets[0].query)) - assert query_results == rows From dbae8a8e514bd2c37b24660da64ebc42d7a33f00 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 09:37:55 +0100 Subject: [PATCH 067/129] Add dashboard with dfsa --- tests/integration/queries/test_migration_progress.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index 369e0df213..a1bf6c9620 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -195,8 +195,10 @@ def policies() -> list[PolicyInfo]: @pytest.fixture def dashboards(make_dashboard, make_query) -> list[Dashboard]: 
query_with_invalid_sql = make_query(sql_query="SELECT SUM(1") + query_with_dfsa = make_query(sql_query="SELECT * FROM csv.`dbfs://folder/file.csv`") records = [ Dashboard.from_sdk_redash_dashboard(make_dashboard(query=query_with_invalid_sql)), + Dashboard.from_sdk_redash_dashboard(make_dashboard(query=query_with_dfsa)), ] return records From f88ab28b6a9a42ce6c31c2226c20810c020892bd Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 09:38:18 +0100 Subject: [PATCH 068/129] Add dashboard that is correct --- tests/integration/queries/test_migration_progress.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index a1bf6c9620..0eb930c427 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -199,6 +199,7 @@ def dashboards(make_dashboard, make_query) -> list[Dashboard]: records = [ Dashboard.from_sdk_redash_dashboard(make_dashboard(query=query_with_invalid_sql)), Dashboard.from_sdk_redash_dashboard(make_dashboard(query=query_with_dfsa)), + Dashboard.from_sdk_redash_dashboard(make_dashboard()), # Correct dashboard ] return records From 40e028d9ac7876fa20319ff8b60927ae9952e860 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 09:41:47 +0100 Subject: [PATCH 069/129] Add query problem with dfsa --- .../queries/test_migration_progress.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index 0eb930c427..6ffcf4996c 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -206,9 +206,11 @@ def dashboards(make_dashboard, make_query) -> list[Dashboard]: @pytest.fixture def query_problems(dashboards: list[Dashboard], ws: WorkspaceClient) -> list[QueryProblem]: - assert len(dashboards) == 1, "This fixtures expects one dashboard" + assert len(dashboards) == 3, "This fixtures expects three dashboards" dashboard_with_invalid_sql, query_id_with_invalid_sql = dashboards[0], dashboards[0].query_ids[0] query_with_invalid_sql = ws.queries.get(query_id_with_invalid_sql) + dashboard_with_dfsa, query_id_dfsa = dashboards[1], dashboards[1].query_ids[0] + query_with_dfsa = ws.queries.get(query_id_with_invalid_sql) records = [ QueryProblem( dashboard_with_invalid_sql.id, @@ -219,7 +221,17 @@ def query_problems(dashboards: list[Dashboard], ws: WorkspaceClient) -> list[Que query_with_invalid_sql.display_name, "sql-parse-error", "Could not parse SQL", - ) + ), + QueryProblem( + dashboard_with_dfsa.id, + dashboard_with_dfsa.parent, + dashboard_with_dfsa.name, + query_with_dfsa.id, + query_with_dfsa.parent_path, + query_with_dfsa.display_name, + "direct-filesystem-access-in-sql-query" + "The use of direct filesystem references is deprecated: dbfs://folder/file.csv" + ), ] return records From ace2dcc22573b15830a77f52b3970f17cf842017 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 09:45:08 +0100 Subject: [PATCH 070/129] Add dashboard with Hive table --- tests/integration/queries/test_migration_progress.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index 6ffcf4996c..ffd898fb09 100644 --- a/tests/integration/queries/test_migration_progress.py +++ 
b/tests/integration/queries/test_migration_progress.py @@ -193,13 +193,15 @@ def policies() -> list[PolicyInfo]: @pytest.fixture -def dashboards(make_dashboard, make_query) -> list[Dashboard]: +def dashboards(make_dashboard, make_query, tables: list[Table]) -> list[Dashboard]: + assert "hive_metastore" == tables[0].catalog, "Expecting table to be a hive table" query_with_invalid_sql = make_query(sql_query="SELECT SUM(1") query_with_dfsa = make_query(sql_query="SELECT * FROM csv.`dbfs://folder/file.csv`") + query_with_hive_table = make_query(sql_query="SELECT * FROM {tables[0].full_name}") records = [ Dashboard.from_sdk_redash_dashboard(make_dashboard(query=query_with_invalid_sql)), Dashboard.from_sdk_redash_dashboard(make_dashboard(query=query_with_dfsa)), - Dashboard.from_sdk_redash_dashboard(make_dashboard()), # Correct dashboard + Dashboard.from_sdk_redash_dashboard(make_dashboard(query=query_with_hive_table)), ] return records From 451c7a799548e1f70cff7e8e643d7addf2854295 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 09:47:14 +0100 Subject: [PATCH 071/129] Reuse tables in used tables --- tests/integration/queries/test_migration_progress.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index ffd898fb09..2344911bb1 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -281,13 +281,14 @@ def dfsas(make_workspace_file, make_query) -> list[DirectFsAccess]: @pytest.fixture -def used_tables(make_workspace_file, make_table) -> list[UsedTable]: - table = make_table(catalog_name="hive_metastore") +def used_tables(make_workspace_file, tables: list[Table]) -> list[UsedTable]: + assert "hive_metastore" == tables[0].catalog, "Expecting table to be a hive table" + table = tables[0] workspace_file = make_workspace_file(content=f'df = spark.read.table("{table.full_name}")\ndisplay(df)') records = [ UsedTable( - catalog_name=table.catalog_name, # This table is pending migration - schema_name=table.schema_name, + catalog_name=table.catalog, # This table is pending migration + schema_name=table.database, table_name=table.name, is_read=False, # Technically, the mocked code is reading the table, but marking it as write allows us to set the owner to From 0f65ee6d43b19acea04beb54da2d257655b21aab Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 09:50:39 +0100 Subject: [PATCH 072/129] Add used table for query --- .../queries/test_migration_progress.py | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index 2344911bb1..fb1f6b5cb4 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -281,9 +281,11 @@ def dfsas(make_workspace_file, make_query) -> list[DirectFsAccess]: @pytest.fixture -def used_tables(make_workspace_file, tables: list[Table]) -> list[UsedTable]: +def used_tables(ws: WorkspaceClient, make_workspace_file, dashboards: list[Dashboard], tables: list[Table]) -> list[UsedTable]: + assert len(dashboards) == 3, "Expecting three dashboards" assert "hive_metastore" == tables[0].catalog, "Expecting table to be a hive table" - table = tables[0] + dashboard, table = dashboards[0], tables[0] + query = ws.queries.get(dashboard.query_ids[0]) 
workspace_file = make_workspace_file(content=f'df = spark.read.table("{table.full_name}")\ndisplay(df)') records = [ UsedTable( @@ -305,6 +307,23 @@ def used_tables(make_workspace_file, tables: list[Table]) -> list[UsedTable]: assessment_start_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=5.0), assessment_end_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=2.0), ), + UsedTable( + catalog_name=table.catalog, # This table is pending migration + schema_name=table.database, + table_name=table.name, + is_read=False, + # Technically, the mocked code is reading the table, but marking it as write allows us to set the owner to + # the current user, which we can test below. + is_write=True, + source_id=query.id, + source_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(hours=2.0), + source_lineage=[ + LineageAtom(object_type="DASHBOARD", object_id=dashboard.id, other={"name": dashboard.name}), + LineageAtom(object_type="QUERY", object_id=f"{dashboard.id}/{query.id}", other={"name": query.display_name}), + ], + assessment_start_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=5.0), + assessment_end_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=2.0), + ), UsedTable( catalog_name="catalog", # This table is migrated schema_name="staff_db", From 2c5a658f2986643e98b82ec90ef779588985a881 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 09:52:42 +0100 Subject: [PATCH 073/129] Persist used tables in queries --- tests/integration/queries/test_migration_progress.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index fb1f6b5cb4..39039f15b2 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -402,6 +402,13 @@ def catalog_populated( # pylint: disable=too-many-arguments UsedTable, mode='overwrite', ) + # Persist UsedTable to match with dashboard queries + runtime_ctx.sql_backend.save_table( + f'hive_metastore.{runtime_ctx.inventory_database}.used_tables_in_queries', + used_tables, + UsedTable, + mode='overwrite', + ) # Persists QueryProblems to propagate them to Dashboards runtime_ctx.sql_backend.save_table( f'hive_metastore.{runtime_ctx.inventory_database}.query_problems', From 6f35290eb78825609cdd189e2471e8fc300716e7 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 10:17:59 +0100 Subject: [PATCH 074/129] Fix missing comma in query problem --- tests/integration/queries/test_migration_progress.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index 39039f15b2..c37c4dd09c 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -231,8 +231,8 @@ def query_problems(dashboards: list[Dashboard], ws: WorkspaceClient) -> list[Que query_with_dfsa.id, query_with_dfsa.parent_path, query_with_dfsa.display_name, - "direct-filesystem-access-in-sql-query" - "The use of direct filesystem references is deprecated: dbfs://folder/file.csv" + "direct-filesystem-access-in-sql-query", + "The use of direct filesystem references is deprecated: dbfs://folder/file.csv", ), ] return records From b40f100fca76ec0e27691ed802ccc17fd08b99c8 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 10:18:49 +0100 Subject: [PATCH 
075/129] Fix reference to wrong query id

---
 tests/integration/queries/test_migration_progress.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index c37c4dd09c..a661bae448 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -211,8 +211,8 @@ def query_problems(dashboards: list[Dashboard], ws: WorkspaceClient) -> list[Que
     assert len(dashboards) == 3, "This fixtures expects three dashboards"
     dashboard_with_invalid_sql, query_id_with_invalid_sql = dashboards[0], dashboards[0].query_ids[0]
     query_with_invalid_sql = ws.queries.get(query_id_with_invalid_sql)
-    dashboard_with_dfsa, query_id_dfsa = dashboards[1], dashboards[1].query_ids[0]
-    query_with_dfsa = ws.queries.get(query_id_with_invalid_sql)
+    dashboard_with_dfsa, query_id_with_dfsa = dashboards[1], dashboards[1].query_ids[0]
+    query_with_dfsa = ws.queries.get(query_id_with_dfsa)
     records = [
         QueryProblem(
             dashboard_with_invalid_sql.id,
@@ -281,7 +281,9 @@ def dfsas(make_workspace_file, make_query) -> list[DirectFsAccess]:


 @pytest.fixture
-def used_tables(ws: WorkspaceClient, make_workspace_file, dashboards: list[Dashboard], tables: list[Table]) -> list[UsedTable]:
+def used_tables(
+    ws: WorkspaceClient, make_workspace_file, dashboards: list[Dashboard], tables: list[Table]
+) -> list[UsedTable]:
     assert len(dashboards) == 3, "Expecting three dashboards"
     assert "hive_metastore" == tables[0].catalog, "Expecting table to be a hive table"
     dashboard, table = dashboards[0], tables[0]
@@ -319,7 +321,9 @@ def used_tables(ws: WorkspaceClient, make_workspace_file, dashboards: list[Dashb
         source_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(hours=2.0),
         source_lineage=[
             LineageAtom(object_type="DASHBOARD", object_id=dashboard.id, other={"name": dashboard.name}),
-            LineageAtom(object_type="QUERY", object_id=f"{dashboard.id}/{query.id}", other={"name": query.display_name}),
+            LineageAtom(
+                object_type="QUERY", object_id=f"{dashboard.id}/{query.id}", other={"name": query.display_name}
+            ),
         ],
         assessment_start_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=5.0),
         assessment_end_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=2.0),

From 001d8254dcf80fabd50ee6c9ce026094d17222a7 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 18 Dec 2024 10:25:36 +0100
Subject: [PATCH 076/129] Swap columns

---
 .../progress/main/03_05_dashboards_pending_migration.sql | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/databricks/labs/ucx/queries/progress/main/03_05_dashboards_pending_migration.sql b/src/databricks/labs/ucx/queries/progress/main/03_05_dashboards_pending_migration.sql
index e22c63e876..bba364e1c3 100644
--- a/src/databricks/labs/ucx/queries/progress/main/03_05_dashboards_pending_migration.sql
+++ b/src/databricks/labs/ucx/queries/progress/main/03_05_dashboards_pending_migration.sql
@@ -7,8 +7,8 @@
     {"fieldName": "workspace_id", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "workspace_id"},
     {"fieldName": "owner", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "owner"},
     {"fieldName": "name", "title": "Name", "type": "string", "displayAs": "link", "linkUrlTemplate": "{{ dashboard_link }}", "linkTextTemplate": "{{ @ }}", "linkTitleTemplate": "{{ @ }}", "linkOpenInNewTab": true, "booleanValues": ["false", 
"true"]}, - {"fieldName": "failure", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "failure"}, - {"fieldName": "dashboard_type", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "dashboard_type"} + {"fieldName": "dashboard_type", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "dashboard_type"}, + {"fieldName": "failure", "booleanValues": ["false", "true"], "type": "string", "displayAs": "string", "title": "failure"} ]}, "invisibleColumns": [ {"fieldName": "dashboard_link", "title": "dashboard_link", "type": "string", "displayAs": "string", "booleanValues": ["false", "true"]} From 94eb90fbbcea8218cf49c8191085973322f43512 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 10:40:11 +0100 Subject: [PATCH 077/129] Use non-migrated table --- .../queries/test_migration_progress.py | 50 ++++++++++++++----- 1 file changed, 37 insertions(+), 13 deletions(-) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index a661bae448..1bdd00e90a 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -75,6 +75,15 @@ def table_migration_status(tables: list[Table]) -> list[TableMigrationStatus]: return records +@pytest.fixture +def table_migration_status_pending_migration( + table_migration_status: list[TableMigrationStatus], +) -> list[TableMigrationStatus]: + records = [status for status in table_migration_status if status.dst_catalog is None] + assert records, "Expecting a table pending migration" + return records + + @pytest.fixture def udfs() -> list[Udf]: records = [ @@ -193,11 +202,19 @@ def policies() -> list[PolicyInfo]: @pytest.fixture -def dashboards(make_dashboard, make_query, tables: list[Table]) -> list[Dashboard]: - assert "hive_metastore" == tables[0].catalog, "Expecting table to be a hive table" +def dashboards( + make_dashboard, make_query, table_migration_status_pending_migration: list[TableMigrationStatus] +) -> list[Dashboard]: query_with_invalid_sql = make_query(sql_query="SELECT SUM(1") query_with_dfsa = make_query(sql_query="SELECT * FROM csv.`dbfs://folder/file.csv`") - query_with_hive_table = make_query(sql_query="SELECT * FROM {tables[0].full_name}") + table_full_name_pending_migration = ".".join( + [ + "hive_metastore", + table_migration_status_pending_migration[0].src_schema, + table_migration_status_pending_migration[0].src_table, + ] + ) + query_with_hive_table = make_query(sql_query=f"SELECT * FROM {table_full_name_pending_migration}") records = [ Dashboard.from_sdk_redash_dashboard(make_dashboard(query=query_with_invalid_sql)), Dashboard.from_sdk_redash_dashboard(make_dashboard(query=query_with_dfsa)), @@ -282,18 +299,25 @@ def dfsas(make_workspace_file, make_query) -> list[DirectFsAccess]: @pytest.fixture def used_tables( - ws: WorkspaceClient, make_workspace_file, dashboards: list[Dashboard], tables: list[Table] + ws: WorkspaceClient, + make_workspace_file, + dashboards: list[Dashboard], + table_migration_status_pending_migration: list[TableMigrationStatus], ) -> list[UsedTable]: assert len(dashboards) == 3, "Expecting three dashboards" - assert "hive_metastore" == tables[0].catalog, "Expecting table to be a hive table" - dashboard, table = dashboards[0], tables[0] + dashboard, table_migration_status = dashboards[0], table_migration_status_pending_migration[0] + table_full_name_pending_migration = ".".join( 
+ ["hive_metastore", table_migration_status.src_schema, table_migration_status.src_table] + ) + workspace_file = make_workspace_file( + content=f'df = spark.read.table("{table_full_name_pending_migration}")\ndisplay(df)' + ) query = ws.queries.get(dashboard.query_ids[0]) - workspace_file = make_workspace_file(content=f'df = spark.read.table("{table.full_name}")\ndisplay(df)') records = [ UsedTable( - catalog_name=table.catalog, # This table is pending migration - schema_name=table.database, - table_name=table.name, + catalog_name="hive_metastore", + schema_name=table_migration_status.src_schema, + table_name=table_migration_status.src_table, is_read=False, # Technically, the mocked code is reading the table, but marking it as write allows us to set the owner to # the current user, which we can test below. @@ -310,9 +334,9 @@ def used_tables( assessment_end_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=2.0), ), UsedTable( - catalog_name=table.catalog, # This table is pending migration - schema_name=table.database, - table_name=table.name, + catalog_name="hive_metastore", + schema_name=table_migration_status.src_schema, + table_name=table_migration_status.src_table, is_read=False, # Technically, the mocked code is reading the table, but marking it as write allows us to set the owner to # the current user, which we can test below. From b1ca9752b7ce19ec1ad011903384ce54890bf85e Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 10:42:01 +0100 Subject: [PATCH 078/129] Assert None fields --- .../queries/test_migration_progress.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index 1bdd00e90a..83a5291d14 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -230,6 +230,20 @@ def query_problems(dashboards: list[Dashboard], ws: WorkspaceClient) -> list[Que query_with_invalid_sql = ws.queries.get(query_id_with_invalid_sql) dashboard_with_dfsa, query_id_with_dfsa = dashboards[1], dashboards[1].query_ids[0] query_with_dfsa = ws.queries.get(query_id_with_dfsa) + assert ( + dashboard_with_invalid_sql.id is not None + and dashboard_with_invalid_sql.parent is not None + and dashboard_with_invalid_sql.name is not None + and query_with_invalid_sql.id is not None + and query_with_invalid_sql.parent_path is not None + and query_with_invalid_sql.display_name is not None + and dashboard_with_dfsa.id is not None + and dashboard_with_dfsa.parent is not None + and dashboard_with_dfsa.name is not None + and query_with_dfsa.id is not None + and query_with_dfsa.parent_path is not None + and query_with_dfsa.display_name is not None + ) records = [ QueryProblem( dashboard_with_invalid_sql.id, @@ -313,6 +327,7 @@ def used_tables( content=f'df = spark.read.table("{table_full_name_pending_migration}")\ndisplay(df)' ) query = ws.queries.get(dashboard.query_ids[0]) + assert query.id is not None and query.display_name is not None and dashboard.name is not None records = [ UsedTable( catalog_name="hive_metastore", From 33e82b770e9e25c23af5125f16ccd90be22ef617 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 10:44:38 +0100 Subject: [PATCH 079/129] Shorten variable name --- tests/integration/queries/test_migration_progress.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/tests/integration/queries/test_migration_progress.py 
b/tests/integration/queries/test_migration_progress.py index 83a5291d14..359a6fecc8 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -207,14 +207,9 @@ def dashboards( ) -> list[Dashboard]: query_with_invalid_sql = make_query(sql_query="SELECT SUM(1") query_with_dfsa = make_query(sql_query="SELECT * FROM csv.`dbfs://folder/file.csv`") - table_full_name_pending_migration = ".".join( - [ - "hive_metastore", - table_migration_status_pending_migration[0].src_schema, - table_migration_status_pending_migration[0].src_table, - ] - ) - query_with_hive_table = make_query(sql_query=f"SELECT * FROM {table_full_name_pending_migration}") + table_migration_status = table_migration_status_pending_migration[0] + table_full_name = ".".join(["hive_metastore", table_migration_status.src_schema, table_migration_status.src_table]) + query_with_hive_table = make_query(sql_query=f"SELECT * FROM {table_full_name}") records = [ Dashboard.from_sdk_redash_dashboard(make_dashboard(query=query_with_invalid_sql)), Dashboard.from_sdk_redash_dashboard(make_dashboard(query=query_with_dfsa)), From cd0ae18db86a3b6a6df1b877087a6e1b6b9b40ee Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 10:48:51 +0100 Subject: [PATCH 080/129] Move workflow run into for loop --- .../queries/test_migration_progress.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index 359a6fecc8..3ae17d7413 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -406,12 +406,6 @@ def catalog_populated( # pylint: disable=too-many-arguments "function" scoped, thus one should first evaluate if those can be changed. 
""" ProgressTrackingInstallation(runtime_ctx.sql_backend, runtime_ctx.ucx_catalog).run() - runtime_ctx.sql_backend.save_table( - f"{runtime_ctx.ucx_catalog}.multiworkspace.workflow_runs", - workflow_runs, - WorkflowRun, - mode="overwrite", - ) # Persist workflow problems to propagate failures to jobs runtime_ctx.sql_backend.save_table( f'hive_metastore.{runtime_ctx.inventory_database}.workflow_problems', @@ -454,8 +448,14 @@ def catalog_populated( # pylint: disable=too-many-arguments QueryProblem, mode='overwrite', ) - for parent_run_id in range(1, 3): # No changes in progress between the two runs - runtime_ctx = runtime_ctx.replace(parent_run_id=parent_run_id) + for workflow_run in workflow_runs: # No changes in progress between the two runs + runtime_ctx.sql_backend.save_table( + f"{runtime_ctx.ucx_catalog}.multiworkspace.workflow_runs", + [workflow_run], + WorkflowRun, + mode="append", + ) + runtime_ctx = runtime_ctx.replace(parent_run_id=workflow_run.workflow_run_id) runtime_ctx.tables_progress.append_inventory_snapshot(tables) # The deletes below reset the cached parent run ids on the encoders del runtime_ctx.tables_progress From 4377ed43ee710fae3261b9c6c91c30e374dda4a5 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 10:49:33 +0100 Subject: [PATCH 081/129] Rename table migration statuses --- tests/integration/queries/test_migration_progress.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index 3ae17d7413..473a9c20f1 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -64,7 +64,7 @@ def tables() -> list[Table]: @pytest.fixture -def table_migration_status(tables: list[Table]) -> list[TableMigrationStatus]: +def table_migration_statuses(tables: list[Table]) -> list[TableMigrationStatus]: records = [] for table in tables: if table.database == "schema1": # schema1 tables are migrated @@ -77,9 +77,9 @@ def table_migration_status(tables: list[Table]) -> list[TableMigrationStatus]: @pytest.fixture def table_migration_status_pending_migration( - table_migration_status: list[TableMigrationStatus], + table_migration_statuses, ) -> list[TableMigrationStatus]: - records = [status for status in table_migration_status if status.dst_catalog is None] + records = [status for status in table_migration_statuses if status.dst_catalog is None] assert records, "Expecting a table pending migration" return records @@ -388,7 +388,7 @@ def catalog_populated( # pylint: disable=too-many-arguments runtime_ctx: MockRuntimeContext, workflow_runs: list[WorkflowRun], tables: list[Table], - table_migration_status: list[TableMigrationStatus], + table_migration_statuses, udfs: list[Udf], grants: list[Grant], jobs: list[JobInfo], @@ -416,7 +416,7 @@ def catalog_populated( # pylint: disable=too-many-arguments # Persists table migration status to propagate which tables are pending migration runtime_ctx.sql_backend.save_table( f'hive_metastore.{runtime_ctx.inventory_database}.migration_status', - table_migration_status, + table_migration_statuses, TableMigrationStatus, mode='overwrite', ) From 2db023273b2d3dd438208736ed1e53f4546755f3 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 10:50:21 +0100 Subject: [PATCH 082/129] Rename table migration statuses pending migration --- .../integration/queries/test_migration_progress.py | 14 +++++--------- 1 file changed, 5 insertions(+), 
 9 deletions(-)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index 473a9c20f1..63ef8b061e 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -76,9 +76,7 @@ def table_migration_statuses(tables: list[Table]) -> list[TableMigrationStatus]:


 @pytest.fixture
-def table_migration_status_pending_migration(
-    table_migration_statuses,
-) -> list[TableMigrationStatus]:
+def statuses_pending_migration(table_migration_statuses: list[TableMigrationStatus]) -> list[TableMigrationStatus]:
     records = [status for status in table_migration_statuses if status.dst_catalog is None]
     assert records, "Expecting a table pending migration"
     return records
@@ -202,12 +200,10 @@ def policies() -> list[PolicyInfo]:


 @pytest.fixture
-def dashboards(
-    make_dashboard, make_query, table_migration_status_pending_migration: list[TableMigrationStatus]
-) -> list[Dashboard]:
+def dashboards(make_dashboard, make_query, statuses_pending_migration) -> list[Dashboard]:
     query_with_invalid_sql = make_query(sql_query="SELECT SUM(1")
     query_with_dfsa = make_query(sql_query="SELECT * FROM csv.`dbfs://folder/file.csv`")
-    table_migration_status = table_migration_status_pending_migration[0]
+    table_migration_status = statuses_pending_migration[0]
     table_full_name = ".".join(["hive_metastore", table_migration_status.src_schema, table_migration_status.src_table])
     query_with_hive_table = make_query(sql_query=f"SELECT * FROM {table_full_name}")
     records = [
@@ -311,10 +307,10 @@ def used_tables(
     ws: WorkspaceClient,
     make_workspace_file,
     dashboards: list[Dashboard],
-    table_migration_status_pending_migration: list[TableMigrationStatus],
+    statuses_pending_migration,
 ) -> list[UsedTable]:
     assert len(dashboards) == 3, "Expecting three dashboards"
-    dashboard, table_migration_status = dashboards[0], table_migration_status_pending_migration[0]
+    dashboard, table_migration_status = dashboards[0], statuses_pending_migration[0]
     table_full_name_pending_migration = ".".join(
         ["hive_metastore", table_migration_status.src_schema, table_migration_status.src_table]
     )

From 73fcf0ee6849cc1ff654591a840daab907e1b6e5 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 18 Dec 2024 10:56:51 +0100
Subject: [PATCH 083/129] Rename variable

---
 tests/integration/queries/test_migration_progress.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index 63ef8b061e..b29061ecb0 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -311,12 +311,8 @@ def used_tables(
 ) -> list[UsedTable]:
     assert len(dashboards) == 3, "Expecting three dashboards"
     dashboard, table_migration_status = dashboards[0], statuses_pending_migration[0]
-    table_full_name_pending_migration = ".".join(
-        ["hive_metastore", table_migration_status.src_schema, table_migration_status.src_table]
-    )
-    workspace_file = make_workspace_file(
-        content=f'df = spark.read.table("{table_full_name_pending_migration}")\ndisplay(df)'
-    )
+    table_full_name = ".".join(["hive_metastore", table_migration_status.src_schema, table_migration_status.src_table])
+    workspace_file = make_workspace_file(content=f'df = spark.read.table("{table_full_name}")\ndisplay(df)')
     query = ws.queries.get(dashboard.query_ids[0])
     assert query.id is not None and query.display_name is not None and dashboard.name is not None
     records = [

From 1da55ba3e41b4b09b078b59a8aaecb493c2825da Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 18 Dec 2024 11:02:08 +0100
Subject: [PATCH 084/129] Verify right dashboard is chosen

---
 .../queries/test_migration_progress.py | 58 ++++++++++---------
 1 file changed, 31 insertions(+), 27 deletions(-)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index b29061ecb0..586ee73078 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -309,14 +309,17 @@ def used_tables(
     dashboards: list[Dashboard],
     statuses_pending_migration,
 ) -> list[UsedTable]:
-    assert len(dashboards) == 3, "Expecting three dashboards"
-    dashboard, table_migration_status = dashboards[0], statuses_pending_migration[0]
-    table_full_name = ".".join(["hive_metastore", table_migration_status.src_schema, table_migration_status.src_table])
-    workspace_file = make_workspace_file(content=f'df = spark.read.table("{table_full_name}")\ndisplay(df)')
+    dashboard = dashboards[-1]
     query = ws.queries.get(dashboard.query_ids[0])
     assert query.id is not None and query.display_name is not None and dashboard.name is not None
-    records = [
-        UsedTable(
+    records = []
+    for table_migration_status in statuses_pending_migration:
+        table_full_name = ".".join(
+            ["hive_metastore", table_migration_status.src_schema, table_migration_status.src_table]
+        )
+        assert table_full_name in query.query_text or "", f"Expecting table '{table_full_name} in query: {query.id}"
+        workspace_file = make_workspace_file(content=f'df = spark.read.table("{table_full_name}")\ndisplay(df)')
+        used_python_table = UsedTable(
             catalog_name="hive_metastore",
             schema_name=table_migration_status.src_schema,
             table_name=table_migration_status.src_table,
@@ -334,8 +337,8 @@ def used_tables(
             ],
             assessment_start_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=5.0),
             assessment_end_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=2.0),
-        ),
-        UsedTable(
+        )
+        used_sql_table = UsedTable(
             catalog_name="hive_metastore",
             schema_name=table_migration_status.src_schema,
             table_name=table_migration_status.src_table,
@@ -353,25 +356,26 @@ def used_tables(
             ],
             assessment_start_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=5.0),
             assessment_end_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=2.0),
-        ),
-        UsedTable(
-            catalog_name="catalog",  # This table is migrated
-            schema_name="staff_db",
-            table_name="employees",
-            is_read=False,
-            is_write=True,
-            source_id=str(make_workspace_file()),
-            source_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(hours=2.0),
-            source_lineage=[
-                LineageAtom(object_type="WORKFLOW", object_id="my_workflow_id", other={"name": "my_workflow"}),
-                LineageAtom(object_type="TASK", object_id="my_workflow_id/my_task_id"),
-                LineageAtom(object_type="NOTEBOOK", object_id="my_notebook_path"),
-                LineageAtom(object_type="FILE", object_id="my file_path"),
-            ],
-            assessment_start_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=5.0),
-            assessment_end_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=2.0),
-        ),
-    ]
+        )
+        records.extend([used_python_table, used_sql_table])
+    used_uc_table = UsedTable(
+        catalog_name="catalog",  # This table is migrated
+        schema_name="staff_db",
+        table_name="employees",
+        is_read=False,
+        is_write=True,
+        source_id=str(make_workspace_file()),
+        source_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(hours=2.0),
+        source_lineage=[
+            LineageAtom(object_type="WORKFLOW", object_id="my_workflow_id", other={"name": "my_workflow"}),
+            LineageAtom(object_type="TASK", object_id="my_workflow_id/my_task_id"),
+            LineageAtom(object_type="NOTEBOOK", object_id="my_notebook_path"),
+            LineageAtom(object_type="FILE", object_id="my file_path"),
+        ],
+        assessment_start_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=5.0),
+        assessment_end_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=2.0),
+    )
+    records.append(used_uc_table)
     return records

From a1595f836ba9ec3da19642a2e01b2aa97c7803b8 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 18 Dec 2024 11:04:21 +0100
Subject: [PATCH 085/129] Improve asserts in fixtures

---
 tests/integration/queries/test_migration_progress.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index 586ee73078..87ce0c70f3 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -219,8 +219,12 @@ def query_problems(dashboards: list[Dashboard], ws: WorkspaceClient) -> list[Que
     assert len(dashboards) == 3, "This fixtures expects three dashboards"
     dashboard_with_invalid_sql, query_id_with_invalid_sql = dashboards[0], dashboards[0].query_ids[0]
     query_with_invalid_sql = ws.queries.get(query_id_with_invalid_sql)
+    assert (
+        "SELECT SUM(1" in query_with_invalid_sql.query_text or ""
+    ), f"Expecting invalid query: {query_with_invalid_sql.id}"
     dashboard_with_dfsa, query_id_with_dfsa = dashboards[1], dashboards[1].query_ids[0]
     query_with_dfsa = ws.queries.get(query_id_with_dfsa)
+    assert "dbfs:" in query_id_with_dfsa.query_text, f"Expecting direct filesystem access: {query_with_dfsa.id}"
     assert (
         dashboard_with_invalid_sql.id is not None
         and dashboard_with_invalid_sql.parent is not None

From 186e4a5dd3cc056d2601f06e6c5b7b3258ae62cc Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 18 Dec 2024 11:05:08 +0100
Subject: [PATCH 086/129] Fix asserts

---
 tests/integration/queries/test_migration_progress.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index 87ce0c70f3..465427ab5f 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -219,12 +219,12 @@ def query_problems(dashboards: list[Dashboard], ws: WorkspaceClient) -> list[Que
     assert len(dashboards) == 3, "This fixtures expects three dashboards"
     dashboard_with_invalid_sql, query_id_with_invalid_sql = dashboards[0], dashboards[0].query_ids[0]
     query_with_invalid_sql = ws.queries.get(query_id_with_invalid_sql)
-    assert (
-        "SELECT SUM(1" in query_with_invalid_sql.query_text or ""
-    ), f"Expecting invalid query: {query_with_invalid_sql.id}"
+    assert "SELECT SUM(1" in (
+        query_with_invalid_sql.query_text or ""
+    ), f"Expecting invalid query: {query_with_invalid_sql.id}"
     dashboard_with_dfsa, query_id_with_dfsa = dashboards[1], dashboards[1].query_ids[0]
     query_with_dfsa = ws.queries.get(query_id_with_dfsa)
-    assert "dbfs:" in query_id_with_dfsa.query_text, f"Expecting direct filesystem access: {query_with_dfsa.id}"
+    assert "dbfs:" in (query_id_with_dfsa.query_text or ""), f"Expecting direct filesystem access: {query_with_dfsa.id}"
     assert (
         dashboard_with_invalid_sql.id is not None
         and dashboard_with_invalid_sql.parent is not None
@@ -321,7 +321,7 @@ def used_tables(
         table_full_name = ".".join(
             ["hive_metastore", table_migration_status.src_schema, table_migration_status.src_table]
         )
-        assert table_full_name in query.query_text or "", f"Expecting table '{table_full_name} in query: {query.id}"
+        assert table_full_name in (query.query_text or ""), f"Expecting table '{table_full_name} in query: {query.id}"
         workspace_file = make_workspace_file(content=f'df = spark.read.table("{table_full_name}")\ndisplay(df)')
         used_python_table = UsedTable(
             catalog_name="hive_metastore",

From 329a5fb729b2ad7b274e2e20e4fcd14d641b1187 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 18 Dec 2024 11:05:37 +0100
Subject: [PATCH 087/129] Add type hint

---
 tests/integration/queries/test_migration_progress.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index 465427ab5f..57fd0afd46 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -311,7 +311,7 @@ def used_tables(
     ws: WorkspaceClient,
     make_workspace_file,
     dashboards: list[Dashboard],
-    statuses_pending_migration,
+    statuses_pending_migration: list[TableMigrationStatus],
 ) -> list[UsedTable]:
     dashboard = dashboards[-1]
     query = ws.queries.get(dashboard.query_ids[0])

From 29b570402e433b9936821ab704473a9774b06388 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 18 Dec 2024 11:39:17 +0100
Subject: [PATCH 088/129] Fix reference to variable

---
 tests/integration/queries/test_migration_progress.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index 57fd0afd46..330cc85460 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -224,7 +224,7 @@ def query_problems(dashboards: list[Dashboard], ws: WorkspaceClient) -> list[Que
     ), f"Expecting invalid query: {query_with_invalid_sql.id}"
     dashboard_with_dfsa, query_id_with_dfsa = dashboards[1], dashboards[1].query_ids[0]
     query_with_dfsa = ws.queries.get(query_id_with_dfsa)
-    assert "dbfs:" in (query_id_with_dfsa.query_text or ""), f"Expecting direct filesystem access: {query_with_dfsa.id}"
+    assert "dbfs:" in (query_with_dfsa.query_text or ""), f"Expecting direct filesystem access: {query_with_dfsa.id}"
     assert (
         dashboard_with_invalid_sql.id is not None
         and dashboard_with_invalid_sql.parent is not None

From c4d1e72f52607fb3b26f3af9021ed5f4f90c5b46 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 18 Dec 2024 12:13:21 +0100
Subject: [PATCH 089/129] Add type hinting

---
 tests/integration/queries/test_migration_progress.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index 330cc85460..d0177a3053 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -200,7 +200,7 @@ def policies() -> list[PolicyInfo]:


 @pytest.fixture
-def dashboards(make_dashboard, make_query, statuses_pending_migration) -> list[Dashboard]:
+def dashboards(make_dashboard, make_query, statuses_pending_migration: list[TableMigrationStatus]) -> list[Dashboard]:
     query_with_invalid_sql = make_query(sql_query="SELECT SUM(1")
     query_with_dfsa = make_query(sql_query="SELECT * FROM csv.`dbfs://folder/file.csv`")
     table_migration_status = statuses_pending_migration[0]

From 682964549b4a6c87f7f6a6b0edc9791e9a0aa6a7 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 18 Dec 2024 12:19:28 +0100
Subject: [PATCH 090/129] Make query problems dynamic using dashboards

---
 .../queries/test_migration_progress.py | 79 ++++++++-----------
 1 file changed, 34 insertions(+), 45 deletions(-)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index d0177a3053..5c3b0f1d5b 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -2,6 +2,7 @@
 import webbrowser

 import pytest
+import sqlglot
 from databricks.sdk import WorkspaceClient
 from databricks.labs.blueprint.wheels import find_project_root
 from databricks.labs.lsql.backends import SqlBackend, Row
@@ -216,51 +217,39 @@ def dashboards(make_dashboard, make_query, statuses_pending_migration: list[Tabl

 @pytest.fixture
 def query_problems(dashboards: list[Dashboard], ws: WorkspaceClient) -> list[QueryProblem]:
-    assert len(dashboards) == 3, "This fixtures expects three dashboards"
-    dashboard_with_invalid_sql, query_id_with_invalid_sql = dashboards[0], dashboards[0].query_ids[0]
-    query_with_invalid_sql = ws.queries.get(query_id_with_invalid_sql)
-    assert "SELECT SUM(1" in (
-        query_with_invalid_sql.query_text or ""
-    ), f"Expecting invalid query: {query_with_invalid_sql.id}"
-    dashboard_with_dfsa, query_id_with_dfsa = dashboards[1], dashboards[1].query_ids[0]
-    query_with_dfsa = ws.queries.get(query_id_with_dfsa)
-    assert "dbfs:" in (query_with_dfsa.query_text or ""), f"Expecting direct filesystem access: {query_with_dfsa.id}"
-    assert (
-        dashboard_with_invalid_sql.id is not None
-        and dashboard_with_invalid_sql.parent is not None
-        and dashboard_with_invalid_sql.name is not None
-        and query_with_invalid_sql.id is not None
-        and query_with_invalid_sql.parent_path is not None
-        and query_with_invalid_sql.display_name is not None
-        and dashboard_with_dfsa.id is not None
-        and dashboard_with_dfsa.parent is not None
-        and dashboard_with_dfsa.name is not None
-        and query_with_dfsa.id is not None
-        and query_with_dfsa.parent_path is not None
-        and query_with_dfsa.display_name is not None
-    )
-    records = [
-        QueryProblem(
-            dashboard_with_invalid_sql.id,
-            dashboard_with_invalid_sql.parent,
-            dashboard_with_invalid_sql.name,
-            query_with_invalid_sql.id,
-            query_with_invalid_sql.parent_path,
-            query_with_invalid_sql.display_name,
-            "sql-parse-error",
-            "Could not parse SQL",
-        ),
-        QueryProblem(
-            dashboard_with_dfsa.id,
-            dashboard_with_dfsa.parent,
-            dashboard_with_dfsa.name,
-            query_with_dfsa.id,
-            query_with_dfsa.parent_path,
-            query_with_dfsa.display_name,
-            "direct-filesystem-access-in-sql-query",
-            "The use of direct filesystem references is deprecated: dbfs://folder/file.csv",
-        ),
-    ]
+    records = []
+    for dashboard in dashboards:
+        if len(dashboard.query_ids) == 0:
+            continue
+        query = ws.queries.get(dashboard.query_ids[0])
+        if query.id is None or query.query_text is None:
+            continue
+        try:
+            sqlglot.parse_one(query.query_text, dialect="databricks")
+        except sqlglot.ParseError:
+            query_problem = QueryProblem(
+                dashboard.id,
+                dashboard.parent,
+                dashboard.name,
+                query.id,
+                query.parent_path,
+                query.display_name,
+                "sql-parse-error",
+                "Could not parse SQL",
+            )
+            records.append(query_problem)
+        if "dbfs://" in query.query_text:
+            query_problem = QueryProblem(
+                dashboard.id,
+                dashboard.parent,
+                dashboard.name,
+                query.id,
+                query.parent_path,
+                query.display_name,
+                "direct-filesystem-access-in-sql-query",
+                "The use of direct filesystem references is deprecated: dbfs://...",
+            )
+            records.append(query_problem)
     return records

From a415ee7db64dfc76c55be82faa38503159699636 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 18 Dec 2024 12:22:30 +0100
Subject: [PATCH 091/129] Separate dashboard with Hive table out

---
 .../integration/queries/test_migration_progress.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index 5c3b0f1d5b..555fbf4437 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -201,16 +201,21 @@ def policies() -> list[PolicyInfo]:


 @pytest.fixture
-def dashboards(make_dashboard, make_query, statuses_pending_migration: list[TableMigrationStatus]) -> list[Dashboard]:
-    query_with_invalid_sql = make_query(sql_query="SELECT SUM(1")
-    query_with_dfsa = make_query(sql_query="SELECT * FROM csv.`dbfs://folder/file.csv`")
+def dashboard_with_hive_table(make_query, make_dashboard, statuses_pending_migration: list[TableMigrationStatus]) -> Dashboard:
     table_migration_status = statuses_pending_migration[0]
     table_full_name = ".".join(["hive_metastore", table_migration_status.src_schema, table_migration_status.src_table])
     query_with_hive_table = make_query(sql_query=f"SELECT * FROM {table_full_name}")
+    return Dashboard.from_sdk_redash_dashboard(make_dashboard(query=query_with_hive_table))
+
+
+@pytest.fixture
+def dashboards(make_dashboard, make_query, dashboard_with_hive_table: Dashboard) -> list[Dashboard]:
+    query_with_invalid_sql = make_query(sql_query="SELECT SUM(1")
+    query_with_dfsa = make_query(sql_query="SELECT * FROM csv.`dbfs://folder/file.csv`")
     records = [
+        dashboard_with_hive_table,
         Dashboard.from_sdk_redash_dashboard(make_dashboard(query=query_with_invalid_sql)),
         Dashboard.from_sdk_redash_dashboard(make_dashboard(query=query_with_dfsa)),
-        Dashboard.from_sdk_redash_dashboard(make_dashboard(query=query_with_hive_table)),
     ]
     return records

From af055f28908570963baad51c87a193494f387445 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 18 Dec 2024 12:26:12 +0100
Subject: [PATCH 092/129] Let dashboard reference all Hive tables

---
 .../queries/test_migration_progress.py | 21 +++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index 555fbf4437..7ef292e831 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -83,6 +83,13 @@ def statuses_pending_migration(table_migration_statuses: list[TableMigrationStat
     return records


+@pytest.fixture
+def statuses_migrated(table_migration_statuses: list[TableMigrationStatus]) -> list[TableMigrationStatus]:
+    records = [status for status in table_migration_statuses if status.dst_catalog is not None]
+    assert records, "Expecting a migrated table"
+    return records
+
+
 @pytest.fixture
 def udfs() -> list[Udf]:
     records = [
@@ -201,19 +208,21 @@ def policies() -> list[PolicyInfo]:


 @pytest.fixture
-def dashboard_with_hive_table(make_query, make_dashboard, statuses_pending_migration: list[TableMigrationStatus]) -> Dashboard:
-    table_migration_status = statuses_pending_migration[0]
-    table_full_name = ".".join(["hive_metastore", table_migration_status.src_schema, table_migration_status.src_table])
-    query_with_hive_table = make_query(sql_query=f"SELECT * FROM {table_full_name}")
+def dashboard_with_hive_tables(make_query, make_dashboard, statuses_pending_migration: list[TableMigrationStatus]) -> Dashboard:
+    table_full_names = []
+    for status in statuses_pending_migration:
+        table_full_name = ".".join(["hive_metastore", status.src_schema, status.src_table])
+        table_full_names.append(table_full_name)
+    query_with_hive_table = make_query(sql_query=f"SELECT * FROM {', '.join(table_full_names)}")
     return Dashboard.from_sdk_redash_dashboard(make_dashboard(query=query_with_hive_table))


 @pytest.fixture
-def dashboards(make_dashboard, make_query, dashboard_with_hive_table: Dashboard) -> list[Dashboard]:
+def dashboards(make_dashboard, make_query, dashboard_with_hive_tables: Dashboard) -> list[Dashboard]:
     query_with_invalid_sql = make_query(sql_query="SELECT SUM(1")
     query_with_dfsa = make_query(sql_query="SELECT * FROM csv.`dbfs://folder/file.csv`")
     records = [
-        dashboard_with_hive_table,
+        dashboard_with_hive_tables,
         Dashboard.from_sdk_redash_dashboard(make_dashboard(query=query_with_invalid_sql)),
         Dashboard.from_sdk_redash_dashboard(make_dashboard(query=query_with_dfsa)),
     ]
     return records

From d140dcd62f6f3eea1e3e0c6a867bb937b086b268 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 18 Dec 2024 12:27:38 +0100
Subject: [PATCH 093/129] Use dashboard with Hive table

---
 tests/integration/queries/test_migration_progress.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index 7ef292e831..f930a7b398 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -208,7 +208,9 @@ def policies() -> list[PolicyInfo]:


 @pytest.fixture
-def dashboard_with_hive_tables(make_query, make_dashboard, statuses_pending_migration: list[TableMigrationStatus]) -> Dashboard:
+def dashboard_with_hive_tables(
+    make_query, make_dashboard, statuses_pending_migration: list[TableMigrationStatus]
+) -> Dashboard:
     table_full_names = []
     for status in statuses_pending_migration:
         table_full_name = ".".join(["hive_metastore", status.src_schema, status.src_table])
@@ -313,10 +315,10 @@ def used_tables(
     ws: WorkspaceClient,
     make_workspace_file,
-    dashboards: list[Dashboard],
+    dashboard_with_hive_tables: Dashboard,
     statuses_pending_migration: list[TableMigrationStatus],
 ) -> list[UsedTable]:
-    dashboard = dashboards[-1]
+    dashboard = dashboard_with_hive_tables
     query = ws.queries.get(dashboard.query_ids[0])
     assert query.id is not None and query.display_name is not None and dashboard.name is not None
     records = []

From a0b7db4493a8b62a82e3a203ab7c1bcbf6065ec0 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 18 Dec 2024 12:29:27 +0100
Subject: [PATCH 094/129] Handle None attributes

---
 .../queries/test_migration_progress.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index f930a7b398..6033440aa2 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -245,11 +245,11 @@ def query_problems(dashboards: list[Dashboard], ws: WorkspaceClient) -> list[Que
         except sqlglot.ParseError:
             query_problem = QueryProblem(
                 dashboard.id,
-                dashboard.parent,
-                dashboard.name,
+                dashboard.parent or "UNKNOWN",
+                dashboard.name or "UNKNOWN",
                 query.id,
-                query.parent_path,
-                query.display_name,
+                query.parent_path or "UNKNOWN",
+                query.display_name or "UNKNOWN",
                 "sql-parse-error",
                 "Could not parse SQL",
             )
@@ -257,11 +257,11 @@ def query_problems(dashboards: list[Dashboard], ws: WorkspaceClient) -> list[Que
         if "dbfs://" in query.query_text:
             query_problem = QueryProblem(
                 dashboard.id,
-                dashboard.parent,
-                dashboard.name,
+                dashboard.parent or "UNKNOWN",
+                dashboard.name or "UNKNOWN",
                 query.id,
-                query.parent_path,
-                query.display_name,
+                query.parent_path or "UNKNOWN",
+                query.display_name or "UNKNOWN",
                 "direct-filesystem-access-in-sql-query",
                 "The use of direct filesystem references is deprecated: dbfs://...",
             )
             records.append(query_problem)

From ce64297336f2822c71a0c958169ab850b9cad58e Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 18 Dec 2024 12:31:02 +0100
Subject: [PATCH 095/129] Reuse tables migrated

---
 .../queries/test_migration_progress.py | 38 ++++++++++---------
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index 6033440aa2..99030f0e58 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -317,6 +317,7 @@ def used_tables(
     make_workspace_file,
     dashboard_with_hive_tables: Dashboard,
     statuses_pending_migration: list[TableMigrationStatus],
+    statuses_migrated: list[TableMigrationStatus],
 ) -> list[UsedTable]:
     dashboard = dashboard_with_hive_tables
     query = ws.queries.get(dashboard.query_ids[0])
@@ -367,24 +368,25 @@ def used_tables(
         assessment_end_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=2.0),
     )
     records.extend([used_python_table, used_sql_table])
-    used_uc_table = UsedTable(
-        catalog_name="catalog",  # This table is migrated
-        schema_name="staff_db",
-        table_name="employees",
-        is_read=False,
-        is_write=True,
-        source_id=str(make_workspace_file()),
-        source_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(hours=2.0),
-        source_lineage=[
-            LineageAtom(object_type="WORKFLOW", object_id="my_workflow_id", other={"name": "my_workflow"}),
-            LineageAtom(object_type="TASK", object_id="my_workflow_id/my_task_id"),
-            LineageAtom(object_type="NOTEBOOK", object_id="my_notebook_path"),
-            LineageAtom(object_type="FILE", object_id="my file_path"),
-        ],
-        assessment_start_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=5.0),
-        assessment_end_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=2.0),
-    )
-    records.append(used_uc_table)
+    for status in statuses_migrated:
+        used_uc_table = UsedTable(
+            catalog_name=status.dst_catalog,
+            schema_name=status.dst_schema,
+            table_name=status.dst_table,
+            is_read=False,
+            is_write=True,
+            source_id=str(make_workspace_file()),
+            source_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(hours=2.0),
+            source_lineage=[
+                LineageAtom(object_type="WORKFLOW", object_id="my_workflow_id", other={"name": "my_workflow"}),
+                LineageAtom(object_type="TASK", object_id="my_workflow_id/my_task_id"),
+                LineageAtom(object_type="NOTEBOOK", object_id="my_notebook_path"),
+                LineageAtom(object_type="FILE", object_id="my file_path"),
+            ],
+            assessment_start_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=5.0),
+            assessment_end_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=2.0),
+        )
+        records.append(used_uc_table)
     return records

From 91ee9895d038356ed0b0a860e226a862849237f9 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 18 Dec 2024 12:33:44 +0100
Subject: [PATCH 096/129] Move job with and without failures to separate fixtures

---
 .../queries/test_migration_progress.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index 99030f0e58..1313df2f71 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -144,11 +144,21 @@ def grants() -> list[Grant]:


 @pytest.fixture
-def jobs() -> list[JobInfo]:
+def job_without_failures() -> JobInfo:
+    return JobInfo("1", success=1, failures="")
+
+
+@pytest.fixture
+def job_with_failures() -> JobInfo:
+    return JobInfo("3", success=0, failures="")  # Failures come from workflow problems below
+
+
+@pytest.fixture
+def jobs(job_without_failures: JobInfo, job_with_failures: JobInfo) -> list[JobInfo]:
     records = [
-        JobInfo("1", success=1, failures=""),
+        job_without_failures,
         JobInfo("2", success=0, failures='["No isolation shared clusters not supported in UC"]'),
-        JobInfo("3", success=0, failures=""),  # Failure from workflow problems below
+        job_with_failures,
     ]
     return records

From d5247a9bc662d513c68ec3f17149f737bec70429 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 18 Dec 2024 12:34:21 +0100
Subject: [PATCH 097/129] Link job with failures to workflow problem

---
 tests/integration/queries/test_migration_progress.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index 1313df2f71..1f33473964 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -164,11 +164,11 @@ def jobs(job_without_failures: JobInfo, job_with_failures: JobInfo) -> list[JobI


 @pytest.fixture
-def workflow_problems() -> list[JobProblem]:
+def workflow_problems(job_with_failures: JobInfo) -> list[JobProblem]:
     """Workflow problems are detected by the linter"""
     records = [
         JobProblem(
-            job_id=3,
+            job_id=int(job_with_failures.job_id),
             job_name="Job",
             task_key="4",
             path="file.py",

From 0c37e558424c68f3b00ab73e32a0524fe8636afc Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 18 Dec 2024 12:35:05 +0100
Subject: [PATCH 098/129] Add docstring

---
 tests/integration/queries/test_migration_progress.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index 1f33473964..068d970e1e 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -221,6 +221,7 @@ def dashboard_with_hive_tables(
     make_query, make_dashboard, statuses_pending_migration: list[TableMigrationStatus]
 ) -> Dashboard:
+    """A dashboard with all the Hive tables pending migration"""
     table_full_names = []
     for status in statuses_pending_migration:
         table_full_name = ".".join(["hive_metastore", status.src_schema, status.src_table])
@@ -243,6 +244,12 @@ def query_problems(dashboards: list[Dashboard], ws: WorkspaceClient) -> list[QueryProblem]:
+    """Query problems
+
+    Supported problem codes:
+    - sql-parse-error
+    - direct-filesystem-access-in-sql-query
+    """
     records = []
     for dashboard in dashboards:
         if len(dashboard.query_ids) == 0:
             continue

From 345af2750c86ad8da8cbe02f299ff9c0d414267 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 18 Dec 2024 12:38:46 +0100
Subject: [PATCH 099/129] Reuse job with and without failures

---
 tests/integration/queries/test_migration_progress.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index 068d970e1e..d148e2782f 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -333,6 +333,8 @@ def used_tables(
     ws: WorkspaceClient,
     make_workspace_file,
     dashboard_with_hive_tables: Dashboard,
+    job_with_failures: JobInfo,
+    job_without_failures: JobInfo,
     statuses_pending_migration: list[TableMigrationStatus],
     statuses_migrated: list[TableMigrationStatus],
 ) -> list[UsedTable]:
@@ -357,8 +359,8 @@ def used_tables(
             source_id=str(workspace_file),
             source_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(hours=2.0),
             source_lineage=[
-                LineageAtom(object_type="WORKFLOW", object_id="my_workflow_id", other={"name": "my_workflow"}),
-                LineageAtom(object_type="TASK", object_id="my_workflow_id/my_task_id"),
+                LineageAtom(object_type="WORKFLOW", object_id=job_with_failures.job_id, other={"name": "my_workflow"}),
+                LineageAtom(object_type="TASK", object_id=f"{job_with_failures.job_id}/my_task_id"),
                 LineageAtom(object_type="NOTEBOOK", object_id="my_notebook_path"),
                 LineageAtom(object_type="FILE", object_id=str(workspace_file)),
             ],
@@ -395,8 +397,8 @@ def used_tables(
             source_id=str(make_workspace_file()),
             source_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(hours=2.0),
             source_lineage=[
-                LineageAtom(object_type="WORKFLOW", object_id="my_workflow_id", other={"name": "my_workflow"}),
-                LineageAtom(object_type="TASK", object_id="my_workflow_id/my_task_id"),
+                LineageAtom(object_type="WORKFLOW", object_id=job_without_failures.job_id, other={"name": "my_workflow"}),
+                LineageAtom(object_type="TASK", object_id=f"{job_without_failures.job_id}/my_task_id"),
                 LineageAtom(object_type="NOTEBOOK", object_id="my_notebook_path"),
                 LineageAtom(object_type="FILE", object_id="my file_path"),
             ],

From 9c636acb9640e4865640635fc37303dac2f1f73a Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 18 Dec 2024 12:45:55 +0100
Subject: [PATCH 100/129] Remove redundant UsedTable LineageAtoms

---
 tests/integration/queries/test_migration_progress.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index d148e2782f..094bd259eb 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -361,8 +361,6 @@ def used_tables(
             source_lineage=[
                 LineageAtom(object_type="WORKFLOW", object_id=job_with_failures.job_id, other={"name": "my_workflow"}),
                 LineageAtom(object_type="TASK", object_id=f"{job_with_failures.job_id}/my_task_id"),
-                LineageAtom(object_type="NOTEBOOK", object_id="my_notebook_path"),
-                LineageAtom(object_type="FILE", object_id=str(workspace_file)),
             ],
             assessment_start_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=5.0),
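             # editor's note: both assessment timestamps are synthetic test data; the fixtures only rely on the
             # window opening (~5 minutes ago) before it closes (~2 minutes ago)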
             assessment_end_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=2.0),
@@ -399,8 +397,6 @@ def used_tables(
             source_lineage=[
                 LineageAtom(object_type="WORKFLOW", object_id=job_without_failures.job_id, other={"name": "my_workflow"}),
                 LineageAtom(object_type="TASK", object_id=f"{job_without_failures.job_id}/my_task_id"),
-                LineageAtom(object_type="NOTEBOOK", object_id="my_notebook_path"),
-                LineageAtom(object_type="FILE", object_id="my file_path"),
             ],
             assessment_start_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=5.0),
             assessment_end_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=2.0),

From 1adb5fc168a30179a6865e26e87d5c007a0cd42c Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 18 Dec 2024 12:47:24 +0100
Subject: [PATCH 101/129] Handle None

---
 tests/integration/queries/test_migration_progress.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index 094bd259eb..61cc271009 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -386,6 +386,7 @@ def used_tables(
         )
         records.extend([used_python_table, used_sql_table])
     for status in statuses_migrated:
+        assert status.dst_catalog and status.dst_schema and status.dst_table, "Migrated tables are missing destination"
         used_uc_table = UsedTable(
             catalog_name=status.dst_catalog,
             schema_name=status.dst_schema,
@@ -395,7 +396,9 @@ def used_tables(
             source_id=str(make_workspace_file()),
             source_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(hours=2.0),
             source_lineage=[
-                LineageAtom(object_type="WORKFLOW", object_id=job_without_failures.job_id, other={"name": "my_workflow"}),
+                LineageAtom(
+                    object_type="WORKFLOW", object_id=job_without_failures.job_id, other={"name": "my_workflow"}
+                ),
                 LineageAtom(object_type="TASK", object_id=f"{job_without_failures.job_id}/my_task_id"),
             ],
             assessment_start_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=5.0),

From 29e979b354b2b5a9574e0341b5671a1c2d8232cf Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 18 Dec 2024 12:51:42 +0100
Subject: [PATCH 102/129] Move dbfs location to separate fixture

---
 .../queries/test_migration_progress.py | 25 +++++++++++--------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index 61cc271009..4b8e4ca10c 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -231,9 +231,14 @@ def dashboard_with_hive_tables(


 @pytest.fixture
-def dashboards(make_dashboard, make_query, dashboard_with_hive_tables: Dashboard) -> list[Dashboard]:
+def dbfs_location() -> str:
+    return "dbfs://folder/file.csv"
+
+
+@pytest.fixture
+def dashboards(make_dashboard, make_query, dashboard_with_hive_tables: Dashboard, dbfs_location: str) -> list[Dashboard]:
     query_with_invalid_sql = make_query(sql_query="SELECT SUM(1")
-    query_with_dfsa = make_query(sql_query="SELECT * FROM csv.`dbfs://folder/file.csv`")
+    query_with_dfsa = make_query(sql_query=f"SELECT * FROM csv.`{dbfs_location}`")
     records = [
         dashboard_with_hive_tables,
@@ -243,7 +248,7 @@
-def query_problems(dashboards: list[Dashboard], ws: WorkspaceClient) -> list[QueryProblem]:
+def query_problems(ws: WorkspaceClient, dashboards: list[Dashboard], dbfs_location: str) -> list[QueryProblem]:
     """Query problems

     Supported problem codes:
@@ -271,7 +276,7 @@
                 "Could not parse SQL",
             )
             records.append(query_problem)
-        if "dbfs://" in query.query_text:
+        if dbfs_location in query.query_text:
             query_problem = QueryProblem(
                 dashboard.id,
                 dashboard.parent or "UNKNOWN",
                 dashboard.name or "UNKNOWN",
                 query.id,
                 query.parent_path or "UNKNOWN",
                 query.display_name or "UNKNOWN",
                 "direct-filesystem-access-in-sql-query",
-                "The use of direct filesystem references is deprecated: dbfs://...",
+                f"The use of direct filesystem references is deprecated: {dbfs_location}",
             )
             records.append(query_problem)
     return records


 @pytest.fixture
-def dfsas(make_workspace_file, make_query) -> list[DirectFsAccess]:
+def dfsas(make_workspace_file, make_query, dbfs_location: str) -> list[DirectFsAccess]:
     # TODO: Match the DFSAs with a job and dashboard
-    workspace_file = make_workspace_file(content='df = spark.read.csv("dbfs://folder/file.csv")')
-    query = make_query(sql_query="SELECT * FROM csv.`dbfs://folder/file.csv`")
+    workspace_file = make_workspace_file(content=f'df = spark.read.csv("{dbfs_location}")')
+    query = make_query(sql_query=f"SELECT * FROM csv.`{dbfs_location}`")
     records = [
         DirectFsAccess(
-            path="dbfs://folder/file.csv",
+            path=dbfs_location,
             is_read=False,
             # Technically, the mocked code is reading the path, but marking it as write allows us to set the owner to
             # the current user, which we can test below.
@@ -310,7 +315,7 @@ def dfsas(make_workspace_file, make_query) -> list[DirectFsAccess]:
             assessment_end_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=2.0),
         ),
         DirectFsAccess(
-            path="dbfs://folder/file.csv",
+            path=dbfs_location,
             is_read=False,
             # Technically, the mocked code is reading the path, but marking it as write allows us to set the owner to
             # the current user, which we can test below.
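[Editor's note — not part of the patch series] Patch 102 is one of several commits here that pull a repeated literal or object into a shared pytest fixture. For readers unfamiliar with the mechanism, a minimal, self-contained sketch of the pattern follows; the names other than dbfs_location are made up for illustration, and this is a hedged sketch rather than the project's actual code:

    import pytest


    @pytest.fixture
    def dbfs_location() -> str:
        # Single source of truth for the path literal used across fixtures.
        return "dbfs://folder/file.csv"


    @pytest.fixture
    def dfsa_query_text(dbfs_location: str) -> str:
        # pytest resolves fixtures by parameter name, so this fixture
        # receives the value returned by dbfs_location above.
        return f"SELECT * FROM csv.`{dbfs_location}`"


    def test_query_references_location(dfsa_query_text: str, dbfs_location: str) -> None:
        assert dbfs_location in dfsa_query_text

Because every consumer asks for the same fixture by name, changing the path in one place propagates to the dashboards, query_problems, and dfsas fixtures alike.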
From 23f5609e4e8856db9fdde74b7c680920a130a0c4 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 18 Dec 2024 13:01:11 +0100
Subject: [PATCH 103/129] Split used tables

---
 .../queries/test_migration_progress.py | 53 ++++++++++++-------
 1 file changed, 33 insertions(+), 20 deletions(-)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index 4b8e4ca10c..f5093ad9b2 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -334,29 +334,26 @@ def dfsas(make_workspace_file, make_query, dbfs_location: str) -> list[DirectFsA


 @pytest.fixture
-def used_tables(
+def used_hive_tables(
     ws: WorkspaceClient,
     make_workspace_file,
+    job_with_failures: JobInfo,
     dashboard_with_hive_tables: Dashboard,
     statuses_pending_migration: list[TableMigrationStatus],
-    statuses_migrated: list[TableMigrationStatus],
 ) -> list[UsedTable]:
-    dashboard = dashboard_with_hive_tables
+    """The Hive tables are added to the `job_with_failures` and `dashboard_with_hive_tables`."""
+    job, dashboard = job_with_failures, dashboard_with_hive_tables
     query = ws.queries.get(dashboard.query_ids[0])
     assert query.id is not None and query.display_name is not None and dashboard.name is not None
     records = []
     for status in statuses_pending_migration:
         table_full_name = ".".join(["hive_metastore", status.src_schema, status.src_table])
         assert table_full_name in (query.query_text or ""), f"Expecting table '{table_full_name} in query: {query.id}"
         workspace_file = make_workspace_file(content=f'df = spark.read.table("{table_full_name}")\ndisplay(df)')
         used_python_table = UsedTable(
             catalog_name="hive_metastore",
             schema_name=status.src_schema,
             table_name=status.src_table,
             is_read=False,
             # Technically, the mocked code is reading the table, but marking it as write allows us to set the owner to
             # the current user, which we can test below.
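[Editor's note] The hunk above carves a used_hive_tables fixture out of the old used_tables fixture; the hunk below does the same for the UC side, and then reintroduces used_tables as a thin aggregate that simply concatenates the two lists (see the used_tables fixture near the end of this patch). Existing tests keep receiving a single list while the Hive and UC halves stay independently reusable.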
@@ -390,6 +387,19 @@ def used_tables( assessment_end_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=2.0), ) records.extend([used_python_table, used_sql_table]) + return records + + +@pytest.fixture +def used_uc_tables( + make_workspace_file, + job_without_failures: JobInfo, + statuses_migrated: list[TableMigrationStatus], +) -> list[UsedTable]: + """The UC tables are used by the job without failures.""" + job = job_without_failures + workspace_file = make_workspace_file() + records = [] for status in statuses_migrated: assert status.dst_catalog and status.dst_schema and status.dst_table, "Migrated tables are missing destination" used_uc_table = UsedTable( @@ -398,13 +408,11 @@ def used_tables( table_name=status.dst_table, is_read=False, is_write=True, - source_id=str(make_workspace_file()), + source_id=str(workspace_file), source_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(hours=2.0), source_lineage=[ - LineageAtom( - object_type="WORKFLOW", object_id=job_without_failures.job_id, other={"name": "my_workflow"} - ), - LineageAtom(object_type="TASK", object_id=f"{job_without_failures}/my_task_id"), + LineageAtom(object_type="WORKFLOW", object_id=job.job_id, other={"name": "my_workflow"}), + LineageAtom(object_type="TASK", object_id=f"{job}/my_task_id"), ], assessment_start_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=5.0), assessment_end_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=2.0), @@ -413,6 +421,11 @@ def used_tables( return records +@pytest.fixture +def used_tables(used_hive_tables: list[UsedTable], used_uc_tables: list[UsedTable]) -> list[UsedTable]: + return used_hive_tables + used_uc_tables + + @pytest.fixture def catalog_populated( # pylint: disable=too-many-arguments runtime_ctx: MockRuntimeContext, From 494c5e58a7a1db8cdc2a6ebc8293d1ea203c5e77 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 13:06:56 +0100 Subject: [PATCH 104/129] Format --- tests/integration/queries/test_migration_progress.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index f5093ad9b2..e9dbe677aa 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -236,7 +236,12 @@ def dbfs_location() -> str: @pytest.fixture -def dashboards(make_dashboard, make_query, dashboard_with_hive_tables: Dashboard, dbfs_location: str) -> list[Dashboard]: +def dashboards( + make_dashboard, + make_query, + dashboard_with_hive_tables: Dashboard, + dbfs_location: str, +) -> list[Dashboard]: query_with_invalid_sql = make_query(sql_query="SELECT SUM(1") query_with_dfsa = make_query(sql_query=f"SELECT * FROM csv.`{dbfs_location}`") records = [ From 9ccacf780e90832d3ad1a4e0d5ab302429c3d27d Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 13:56:48 +0100 Subject: [PATCH 105/129] Create a catalog and schema for migrated tables --- tests/integration/queries/test_migration_progress.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index e9dbe677aa..5172e48419 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -65,11 +65,13 @@ def tables() -> list[Table]: @pytest.fixture -def table_migration_statuses(tables: list[Table]) -> 
list[TableMigrationStatus]: +def table_migration_statuses(make_catalog, make_schema, tables: list[Table]) -> list[TableMigrationStatus]: + catalog = make_catalog() + schema = make_schema(catalog_name=catalog.name) records = [] for table in tables: if table.database == "schema1": # schema1 tables are migrated - migration_status = TableMigrationStatus(table.database, table.name, "catalog", table.database, table.name) + migration_status = TableMigrationStatus(table.database, table.name, catalog.name, schema.name, table.name) else: migration_status = TableMigrationStatus(table.database, table.name) records.append(migration_status) From edec9fc1653b1f66cf75d2585ca13b15947471dc Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 14:02:09 +0100 Subject: [PATCH 106/129] Create the tables --- .../queries/test_migration_progress.py | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index 5172e48419..b11dfb07b2 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -55,25 +55,29 @@ def workflow_runs(ws: WorkspaceClient) -> list[WorkflowRun]: @pytest.fixture -def tables() -> list[Table]: +def tables(make_schema, make_table) -> list[Table]: records = [] - for schema in "schema1", "schema2": - for table_name in "table1", "table2", "table3", "table4", "table5": - table = Table("hive_metastore", schema, table_name, "MANAGED", "delta") + for _ in range(2): + schema = make_schema() + for _ in range(5): + table = Table.from_table_info(make_table(schema_name=schema.name)) records.append(table) return records @pytest.fixture -def table_migration_statuses(make_catalog, make_schema, tables: list[Table]) -> list[TableMigrationStatus]: +def table_migration_statuses(make_catalog, make_schema, make_table, tables: list[Table]) -> list[TableMigrationStatus]: catalog = make_catalog() schema = make_schema(catalog_name=catalog.name) records = [] - for table in tables: - if table.database == "schema1": # schema1 tables are migrated - migration_status = TableMigrationStatus(table.database, table.name, catalog.name, schema.name, table.name) - else: - migration_status = TableMigrationStatus(table.database, table.name) + for table in tables[: int(len(tables) / 2)]: # First half is migrated + migrated_table = make_table(catalog_name=catalog.name, schema_name=schema.name, name=table.name) + migration_status = TableMigrationStatus( + table.database, table.name, migrated_table.catalog_name, migrated_table.schema_name, migrated_table.name + ) + records.append(migration_status) + for table in tables[int(len(tables) / 2) :]: # Second half is pending migration + migration_status = TableMigrationStatus(table.database, table.name) records.append(migration_status) return records From b92fbdbdb70a5832df76753d2a3f54def9574cf7 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 14:06:15 +0100 Subject: [PATCH 107/129] Add dashboard with UC tables fixture --- .../queries/test_migration_progress.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index b11dfb07b2..36d787dcff 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -233,7 +233,22 @@ def dashboard_with_hive_tables( 
table_full_name = ".".join(["hive_metastore", status.src_schema, status.src_table]) table_full_names.append(table_full_name) query_with_hive_table = make_query(sql_query=f"SELECT * FROM {', '.join(table_full_names)}") - return Dashboard.from_sdk_redash_dashboard(make_dashboard(query=query_with_hive_table)) + dashboard = Dashboard.from_sdk_redash_dashboard(make_dashboard(query=query_with_hive_table)) + return dashboard + + +@pytest.fixture +def dashboard_with_uc_tables( + make_query, make_dashboard, statuses_migrated: list[TableMigrationStatus] +) -> Dashboard: + """A dashboard with all the UC migrated tables""" + table_full_names = [] + for status in statuses_migrated: + table_full_name = ".".join(["hive_metastore", status.src_schema, status.src_table]) + table_full_names.append(table_full_name) + query_with_hive_table = make_query(sql_query=f"SELECT * FROM {', '.join(table_full_names)}") + dashboard = Dashboard.from_sdk_redash_dashboard(make_dashboard(query=query_with_hive_table)) + return dashboard @pytest.fixture From bf48e278ba0d0fc4e6ad499551e41d224db7ca98 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 14:06:47 +0100 Subject: [PATCH 108/129] Add dashboard with UC tables to dashboards --- tests/integration/queries/test_migration_progress.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index 36d787dcff..be6f72ad6d 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -261,6 +261,7 @@ def dashboards( make_dashboard, make_query, dashboard_with_hive_tables: Dashboard, + dashboard_with_uc_tables: Dashboard, dbfs_location: str, ) -> list[Dashboard]: query_with_invalid_sql = make_query(sql_query="SELECT SUM(1") @@ -269,6 +270,7 @@ def dashboards( dashboard_with_hive_tables, Dashboard.from_sdk_redash_dashboard(make_dashboard(query=query_with_invalid_sql)), Dashboard.from_sdk_redash_dashboard(make_dashboard(query=query_with_dfsa)), + dashboard_with_uc_tables, ] return records From 426d9780736667838d3cf013e18d21f94653794f Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 14:20:53 +0100 Subject: [PATCH 109/129] Add used tables to dashboard --- .../queries/test_migration_progress.py | 33 +++++++++++++++---- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index be6f72ad6d..4ee4bfddd3 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -238,9 +238,7 @@ def dashboard_with_hive_tables( @pytest.fixture -def dashboard_with_uc_tables( - make_query, make_dashboard, statuses_migrated: list[TableMigrationStatus] -) -> Dashboard: +def dashboard_with_uc_tables(make_query, make_dashboard, statuses_migrated: list[TableMigrationStatus]) -> Dashboard: """A dashboard with all the UC migrated tables""" table_full_names = [] for status in statuses_migrated: @@ -420,17 +418,21 @@ def used_hive_tables( @pytest.fixture def used_uc_tables( + ws, make_workspace_file, job_without_failures: JobInfo, + dashboard_with_uc_tables: Dashboard, statuses_migrated: list[TableMigrationStatus], ) -> list[UsedTable]: """The UC tables are used by the job without failures.""" - job = job_without_failures + job, dashboard = job_without_failures, dashboard_with_uc_tables + query = ws.queries.get(dashboard.query_ids[0]) + 
assert query.id is not None and query.display_name is not None and dashboard.name is not None workspace_file = make_workspace_file() records = [] for status in statuses_migrated: assert status.dst_catalog and status.dst_schema and status.dst_table, "Migrated tables are missing destination" - used_uc_table = UsedTable( + used_python_table = UsedTable( catalog_name=status.dst_catalog, schema_name=status.dst_schema, table_name=status.dst_table, @@ -445,7 +447,26 @@ def used_uc_tables( assessment_start_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=5.0), assessment_end_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=2.0), ) - records.append(used_uc_table) + used_sql_table = UsedTable( + catalog_name="hive_metastore", + schema_name=status.src_schema, + table_name=status.src_table, + is_read=False, + # Technically, the mocked code is reading the table, but marking it as write allows us to set the owner to + # the current user, which we can test below. + is_write=True, + source_id=query.id, + source_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(hours=2.0), + source_lineage=[ + LineageAtom(object_type="DASHBOARD", object_id=dashboard.id, other={"name": dashboard.name}), + LineageAtom( + object_type="QUERY", object_id=f"{dashboard.id}/{query.id}", other={"name": query.display_name} + ), + ], + assessment_start_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=5.0), + assessment_end_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=2.0), + ) + records.extend([used_python_table, used_sql_table]) return records From c361d0783e63d78ae9d34b94330aacb12b01373a Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 14:33:08 +0100 Subject: [PATCH 110/129] Update table ownership --- .../queries/test_migration_progress.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index 4ee4bfddd3..8dd1cb297d 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -129,23 +129,18 @@ def udfs() -> list[Udf]: @pytest.fixture -def grants() -> list[Grant]: +def grants(tables: list[Table]) -> list[Grant]: records = [ Grant("service_principal", "USAGE", "hive_metastore"), Grant("Eric", "OWN", "hive_metastore", "sales"), Grant("Liran", "DENY", "hive_metastore", "sales"), # DENY creates a failure - # Set ownership of mocked tables above - Grant("Andrew", "OWN", "hive_metastore", "schema1", "table1"), - Grant("Eric", "OWN", "hive_metastore", "schema1", "table2"), - Grant("Cor", "OWN", "hive_metastore", "schema1", "table3"), - Grant("Cor", "OWN", "hive_metastore", "schema1", "table4"), - Grant("Cor", "OWN", "hive_metastore", "schema1", "table5"), - Grant("Andrew", "OWN", "hive_metastore", "schema2", "table1"), - Grant("Cor", "OWN", "hive_metastore", "schema2", "table2"), - Grant("Cor", "OWN", "hive_metastore", "schema2", "table3"), - Grant("Cor", "OWN", "hive_metastore", "schema2", "table4"), - Grant("Cor", "OWN", "hive_metastore", "schema2", "table5"), + Grant("Andrew", "OWN", tables[0].catalog, tables[0].database, tables[0].name), + Grant("Eric", "OWN", tables[1].catalog, tables[1].database, tables[1].name), + Grant("Andrew", "OWN", tables[-1].catalog, tables[-1].database, tables[-1].name), ] + for table in tables[2:-1]: # Remaining tables + grant = Grant("Cor", "OWN", table.catalog, table.database, table.name) + 
records.append(grant) return records From 75f42b8b0349af7d07e9a01a875c89414d28f7f6 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 15:33:09 +0100 Subject: [PATCH 111/129] Fix query names in test --- tests/integration/queries/test_migration_progress.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index 8dd1cb297d..cd7ef69788 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -705,7 +705,7 @@ def test_migration_progress_query_data_asset_references_pending_migration_overvi sql_backend: SqlBackend, ) -> None: """Separate test is required to set the owner of the used table at runtime""" - query_name = "03_04_data_asset_references_pending_migration_overview" + query_name = "03_04_dashboards_pending_migration_by_owner_overview" current_user = ws.current_user.me().user_name rows = [ Row( @@ -739,7 +739,7 @@ def test_migration_progress_query_data_asset_references_pending_migration( used_tables: list[UsedTable], ) -> None: """Separate test is required to set the dfsas and used table dynamically""" - query_name = "03_05_data_asset_references_pending_migration" + query_name = "03_05_dashboards_pending_migration" workspace_id = ws.get_workspace_id() current_user = ws.current_user.me().user_name rows = [] From c89bd0e26c75fd0af470fea6af2e57d476257fdb Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 15:34:14 +0100 Subject: [PATCH 112/129] Fix number of dashboards pending migration --- tests/integration/queries/test_migration_progress.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index cd7ef69788..1b7e019926 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -692,7 +692,7 @@ def test_migration_progress_query_data_asset_references_by_owner_bar_graph( ) -> None: """Separate test is required to set the owner of the used table at runtime""" query_name = "03_02_dashboards_pending_migration_by_owner_bar_graph" - rows = [Row(owner=ws.current_user.me().user_name, count=1)] + rows = [Row(owner=ws.current_user.me().user_name, count=3)] datasets = [d for d in dashboard_metadata.get_datasets() if d.name == query_name] assert len(datasets) == 1, f"Missing query: {query_name}" query_results = list(sql_backend.fetch(datasets[0].query)) From cf890b9a7d67c1ad7a51099e76bb2ee569b1997d Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 15:37:36 +0100 Subject: [PATCH 113/129] Fix expected row for dashboard pending migration --- .../queries/test_migration_progress.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index 1b7e019926..36aa88a89e 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -710,19 +710,10 @@ def test_migration_progress_query_data_asset_references_pending_migration_overvi rows = [ Row( owner=current_user, - object_type="Direct filesystem access", - percentage=0, - total=2, - total_migrated=0, - total_not_migrated=2, - ), - Row( - owner=current_user, - object_type="Table or view reference", - percentage=50, - total=2, + 
percentage=round(100 * 1 / 4, 2), + total=4, total_migrated=1, - total_not_migrated=1, + total_not_migrated=3, ), ] datasets = [d for d in dashboard_metadata.get_datasets() if d.name == query_name] From b6b8cfc7e7e2e089794fed884c73791ffedc101b Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 15:40:47 +0100 Subject: [PATCH 114/129] Move distinct failure per object type to the bottom --- ...ect_type.sql => 99_99_distinct_failures_per_object_type.sql} | 0 tests/integration/queries/test_migration_progress.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename src/databricks/labs/ucx/queries/progress/main/{01_08_distinct_failures_per_object_type.sql => 99_99_distinct_failures_per_object_type.sql} (100%) diff --git a/src/databricks/labs/ucx/queries/progress/main/01_08_distinct_failures_per_object_type.sql b/src/databricks/labs/ucx/queries/progress/main/99_99_distinct_failures_per_object_type.sql similarity index 100% rename from src/databricks/labs/ucx/queries/progress/main/01_08_distinct_failures_per_object_type.sql rename to src/databricks/labs/ucx/queries/progress/main/99_99_distinct_failures_per_object_type.sql diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index 36aa88a89e..56c9f28750 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -607,7 +607,7 @@ def test_migration_progress_dashboard( ("01_06_percentage_pipeline_migration_progress", [Row(percentage=round(100 * 1 / 2, 2))]), ("01_07_percentage_policy_migration_progress", [Row(percentage=round(100 * 1 / 2, 2))]), ( - "01_08_distinct_failures_per_object_type", + "99_99_distinct_failures_per_object_type", [ Row( object_type="ClusterInfo", From df9dedb2a95d19502ea4dab4bd8b7b9c619234bb Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 15:41:17 +0100 Subject: [PATCH 115/129] Add dashboard to overall progress --- .../progress/main/01_00_percentage_migration_progress.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/queries/progress/main/01_00_percentage_migration_progress.sql b/src/databricks/labs/ucx/queries/progress/main/01_00_percentage_migration_progress.sql index d429af42cc..7498b8f9c6 100644 --- a/src/databricks/labs/ucx/queries/progress/main/01_00_percentage_migration_progress.sql +++ b/src/databricks/labs/ucx/queries/progress/main/01_00_percentage_migration_progress.sql @@ -2,4 +2,4 @@ SELECT ROUND(100 * try_divide(COUNT_IF(SIZE(failures) = 0), COUNT(*)), 2) AS percentage FROM ucx_catalog.multiworkspace.objects_snapshot -WHERE object_type IN ('ClusterInfo', 'Grant', 'JobInfo', 'PipelineInfo', 'PolicyInfo', 'Table', 'Udf') +WHERE object_type IN ('ClusterInfo', 'Grant', 'Dashboard', 'JobInfo', 'PipelineInfo', 'PolicyInfo', 'Table', 'Udf') From a5569a9436da24ded61c9b292ad8fd333be978bb Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 15:42:55 +0100 Subject: [PATCH 116/129] Add dashboard migration progress counter --- .../main/01_08_percentage_dashboard_migration_progress.sql | 5 +++++ tests/integration/queries/test_migration_progress.py | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 src/databricks/labs/ucx/queries/progress/main/01_08_percentage_dashboard_migration_progress.sql diff --git a/src/databricks/labs/ucx/queries/progress/main/01_08_percentage_dashboard_migration_progress.sql 
From bc04f60187ac5b4c7ed8e679fdf8b23e228d89d7 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 18 Dec 2024 15:45:23 +0100
Subject: [PATCH 117/129] Move fixture rows around

---
 .../queries/test_migration_progress.py | 64 +++++++++----------
 1 file changed, 32 insertions(+), 32 deletions(-)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index ccdd4a107c..df646a2ef7 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -607,6 +607,38 @@ def test_migration_progress_dashboard(
         ("01_06_percentage_pipeline_migration_progress", [Row(percentage=round(100 * 1 / 2, 2))]),
         ("01_07_percentage_policy_migration_progress", [Row(percentage=round(100 * 1 / 2, 2))]),
         ("01_08_percentage_dashboard_migration_progress", [Row(percentage=round(100 * 1 / 4, 2))]),
+        (
+            "02_1_pending_migration_data_objects",
+            [Row(count=5)],
+        ),
+        (
+            "02_2_migration_status_by_owner_bar_graph",
+            [Row(owner="Andrew", count=1), Row(owner="Cor", count=4)],
+        ),
+        (
+            "02_3_migrated_data_objects",
+            [Row(count=5)],
+        ),
+        (
+            "02_4_migration_status_by_owner_overview",
+            [
+                Row(owner="Andrew", percentage=round(100 * 1 / 2, 2), total=2, total_migrated=1, total_not_migrated=1),
+                Row(owner="Cor", percentage=round(100 * 3 / 7, 2), total=7, total_migrated=3, total_not_migrated=4),
+                Row(owner="Eric", percentage=round(100 * 1 / 1, 2), total=1, total_migrated=1, total_not_migrated=0),
+            ],
+        ),
+        (
+            "03_01_dashboards_pending_migration",
+            [
+                Row(count=3),
+            ],
+        ),
+        (
+            "03_03_dashboards_migrated",
+            [
+                Row(count=1),
+            ],
+        ),
             Row(object_type="Udf", count=1, failure="UDF not supported by UC"),
         ],
     ),
-        (
-            "02_1_pending_migration_data_objects",
-            [Row(count=5)],
-        ),
-        (
-            "02_2_migration_status_by_owner_bar_graph",
-            [Row(owner="Andrew", count=1), Row(owner="Cor", count=4)],
-        ),
-        (
-            "02_3_migrated_data_objects",
-            [Row(count=5)],
-        ),
-        (
-            "02_4_migration_status_by_owner_overview",
-            [
-                Row(owner="Andrew", percentage=round(100 * 1 / 2, 2), total=2, total_migrated=1, total_not_migrated=1),
-                Row(owner="Cor", percentage=round(100 * 3 / 7, 2), total=7, total_migrated=3, total_not_migrated=4),
-                Row(owner="Eric", percentage=round(100 * 1 / 1, 2), total=1, total_migrated=1, total_not_migrated=0),
-            ],
-        ),
-        (
-            "03_01_dashboards_pending_migration",
-            [
-                Row(count=3),
-            ],
-        ),
-        (
-            "03_03_dashboards_migrated",
-            [
-                Row(count=1),
-            ],
-        ),
     ],
 )
 def test_migration_progress_query(

From edfe3e56437ae9483ea3bc9d556899d25acec30c Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 18 Dec 2024 15:54:39 +0100
Subject: [PATCH 118/129] Sort by failure

---
 .../progress/main/03_05_dashboards_pending_migration.sql | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/databricks/labs/ucx/queries/progress/main/03_05_dashboards_pending_migration.sql b/src/databricks/labs/ucx/queries/progress/main/03_05_dashboards_pending_migration.sql
index bba364e1c3..d6a57ea512 100644
--- a/src/databricks/labs/ucx/queries/progress/main/03_05_dashboards_pending_migration.sql
+++ b/src/databricks/labs/ucx/queries/progress/main/03_05_dashboards_pending_migration.sql
@@ -32,4 +32,4 @@ SELECT
     END AS dashboard_link
 FROM ucx_catalog.multiworkspace.objects_snapshot
 WHERE object_type = 'Dashboard' AND SIZE(failures) > 0
-ORDER BY workspace_id, owner, name
+ORDER BY workspace_id, owner, name, failure
Row(object_type="Udf", count=1, failure="UDF not supported by UC"), ], ), - ( - "02_1_pending_migration_data_objects", - [Row(count=5)], - ), - ( - "02_2_migration_status_by_owner_bar_graph", - [Row(owner="Andrew", count=1), Row(owner="Cor", count=4)], - ), - ( - "02_3_migrated_data_objects", - [Row(count=5)], - ), - ( - "02_4_migration_status_by_owner_overview", - [ - Row(owner="Andrew", percentage=round(100 * 1 / 2, 2), total=2, total_migrated=1, total_not_migrated=1), - Row(owner="Cor", percentage=round(100 * 3 / 7, 2), total=7, total_migrated=3, total_not_migrated=4), - Row(owner="Eric", percentage=round(100 * 1 / 1, 2), total=1, total_migrated=1, total_not_migrated=0), - ], - ), - ( - "03_01_dashboards_pending_migration", - [ - Row(count=3), - ], - ), - ( - "03_03_dashboards_migrated", - [ - Row(count=1), - ], - ), ], ) def test_migration_progress_query( From edfe3e56437ae9483ea3bc9d556899d25acec30c Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 15:54:39 +0100 Subject: [PATCH 118/129] Sort by failure --- .../progress/main/03_05_dashboards_pending_migration.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/queries/progress/main/03_05_dashboards_pending_migration.sql b/src/databricks/labs/ucx/queries/progress/main/03_05_dashboards_pending_migration.sql index bba364e1c3..d6a57ea512 100644 --- a/src/databricks/labs/ucx/queries/progress/main/03_05_dashboards_pending_migration.sql +++ b/src/databricks/labs/ucx/queries/progress/main/03_05_dashboards_pending_migration.sql @@ -32,4 +32,4 @@ SELECT END AS dashboard_link FROM ucx_catalog.multiworkspace.objects_snapshot WHERE object_type = 'Dashboard' AND SIZE(failures) > 0 -ORDER BY workspace_id, owner, name +ORDER BY workspace_id, owner, name, failure From 1facdd26aba53948c8b433115c4a23c425ac33a7 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 18 Dec 2024 17:56:42 +0100 Subject: [PATCH 119/129] Test subset of dashboards pending migration --- .../queries/test_migration_progress.py | 36 ++++++------------- 1 file changed, 10 insertions(+), 26 deletions(-) diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py index df646a2ef7..ec248d1b00 100644 --- a/tests/integration/queries/test_migration_progress.py +++ b/tests/integration/queries/test_migration_progress.py @@ -727,42 +727,26 @@ def test_migration_progress_query_data_asset_references_pending_migration( ws: WorkspaceClient, dashboard_metadata: DashboardMetadata, sql_backend: SqlBackend, - dfsas: list[DirectFsAccess], - used_tables: list[UsedTable], + dashboard_with_hive_tables: Dashboard, + statuses_pending_migration: list[TableMigrationStatus], ) -> None: - """Separate test is required to set the dfsas and used table dynamically""" + """Test the pending tables are mentioned""" query_name = "03_05_dashboards_pending_migration" workspace_id = ws.get_workspace_id() current_user = ws.current_user.me().user_name rows = [] - for dfsa in dfsas: - link_prefix = "/sql/editor/" if dfsa.source_type == "QUERY" else "/#workspace" - row = Row( - workspace_id=workspace_id, - owner=current_user, - object_type="Direct filesystem access", - object_id=dfsas[0].path, - failure="Direct filesystem access is not supported in Unity Catalog", - is_read=False, - is_write=True, - link=f"{link_prefix}{dfsa.source_id}", - ) - rows.append(row) - for used_table in used_tables: - if used_table.catalog_name != "hive_metastore": - continue + for status in statuses_pending_migration: + 
From afd30df2adc124d236ad4d4fd6c15dc691e63dab Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 18 Dec 2024 20:02:37 +0100
Subject: [PATCH 120/129] Rename tests

---
 tests/integration/queries/test_migration_progress.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index ec248d1b00..6f971712d0 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -686,7 +686,7 @@ def test_migration_progress_query(
     assert query_results == rows
 
 
-def test_migration_progress_query_data_asset_references_by_owner_bar_graph(
+def test_migration_progress_query_dashboard_pending_migration_by_owner_bar_graph(
     ws: WorkspaceClient,
     dashboard_metadata: DashboardMetadata,
     sql_backend: SqlBackend,
@@ -700,7 +700,7 @@ def test_migration_progress_query_data_asset_references_by_owner_bar_graph(
 
 
-def test_migration_progress_query_data_asset_references_pending_migration_overview(
+def test_migration_progress_query_dashboards_pending_migration_by_owner_overview(
     ws: WorkspaceClient,
     dashboard_metadata: DashboardMetadata,
     sql_backend: SqlBackend,
@@ -723,14 +723,14 @@
 
 
-def test_migration_progress_query_data_asset_references_pending_migration(
+def test_migration_progress_query_dashboards_pending_migration(
     ws: WorkspaceClient,
     dashboard_metadata: DashboardMetadata,
     sql_backend: SqlBackend,
     dashboard_with_hive_tables: Dashboard,
     statuses_pending_migration: list[TableMigrationStatus],
 ) -> None:
-    """Test the pending tables are mentioned"""
+    """Test if the pending table migrations are mentioned"""
     query_name = "03_05_dashboards_pending_migration"
     workspace_id = ws.get_workspace_id()
     current_user = ws.current_user.me().user_name

From 98dc3325e711524c961b819f10d8d098caf2e566 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 18 Dec 2024 20:05:00 +0100
Subject: [PATCH 121/129] Format

---
 tests/integration/queries/test_migration_progress.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index 6f971712d0..3302bbb6b8 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -742,7 +742,7 @@ def test_migration_progress_query_dashboards_pending_migration(
             owner=current_user,
             dashboard_type="Redash",
             failure=f"Pending migration: {table_full_name}",
-            dashboard_link=f"/sql/dashboards/{dashboard_with_hive_tables.id}"
+            dashboard_link=f"/sql/dashboards/{dashboard_with_hive_tables.id}",
         )
         rows.append(row)
     datasets = [d for d in dashboard_metadata.get_datasets() if d.name == query_name]
From d99be35869aafbf69a3792d035a2eb72ffa64f60 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 18 Dec 2024 20:17:34 +0100
Subject: [PATCH 122/129] Fix total percentage

---
 tests/integration/queries/test_migration_progress.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index 3302bbb6b8..8ac71cae4a 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -598,7 +598,7 @@ def test_migration_progress_dashboard(
 @pytest.mark.parametrize(
     "query_name, rows",
     [
-        ("01_00_percentage_migration_progress", [Row(percentage=round(100 * 23 / 36, 2))]),
+        ("01_00_percentage_migration_progress", [Row(percentage=round(100 * 23 / 38, 2))]),
         ("01_01_percentage_udf_migration_progress", [Row(percentage=round(100 * 1 / 2, 2))]),
         ("01_02_percentage_grant_migration_progress", [Row(percentage=round(100 * 12 / 13, 2))]),
         ("01_03_percentage_job_migration_progress", [Row(percentage=round(100 * 1 / 3, 2))]),

From 8ea471cfd44ab138722bedf8b1fb0bd5cb21c0b9 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Wed, 18 Dec 2024 20:48:25 +0100
Subject: [PATCH 123/129] Add missing field

---
 tests/integration/queries/test_migration_progress.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index 8ac71cae4a..3349734d01 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -740,6 +740,7 @@ def test_migration_progress_query_dashboards_pending_migration(
         row = Row(
             workspace_id=workspace_id,
             owner=current_user,
+            name=dashboard_with_hive_tables.name,
             dashboard_type="Redash",
             failure=f"Pending migration: {table_full_name}",
             dashboard_link=f"/sql/dashboards/{dashboard_with_hive_tables.id}",

From b7df08e9887d2cd6e78f67321cdd5a7e8d2de1f3 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Fri, 20 Dec 2024 11:35:09 +0100
Subject: [PATCH 124/129] Exclude owner from Redash checks

---
 .../queries/test_migration_progress.py | 30 ++++++++++++-------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index 3349734d01..bdb9c2c6de 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -686,31 +686,41 @@ def test_migration_progress_query(
     assert query_results == rows
 
 
+def exclude_fields_from_rows(rows: list[Row], *fields: str) -> list[Row]:
+    """Exclude the given fields from each row."""
+    rows_without_fields = []
+    for row in rows:
+        data = row.asDict()
+        for field in fields:
+            if field in data:
+                data.pop(field)
+        row = Row(**data)
+        rows_without_fields.append(row)
+    return rows_without_fields
+
+
 def test_migration_progress_query_dashboard_pending_migration_by_owner_bar_graph(
-    ws: WorkspaceClient,
     dashboard_metadata: DashboardMetadata,
     sql_backend: SqlBackend,
 ) -> None:
     """Separate test is required to set the owner of the used table at runtime"""
     query_name = "03_02_dashboards_pending_migration_by_owner_bar_graph"
-    rows = [Row(owner=ws.current_user.me().user_name, count=3)]
+    rows = [Row(count=3)]
     datasets = [d for d in dashboard_metadata.get_datasets() if d.name == query_name]
     assert len(datasets) == 1, f"Missing query: {query_name}"
     query_results = list(sql_backend.fetch(datasets[0].query))
-    assert query_results == rows
+    # See `test_redash_dashboard_ownership_is_me` for why we exclude the owner
+    assert exclude_fields_from_rows(query_results, "owner") == rows
 
 
 def test_migration_progress_query_dashboards_pending_migration_by_owner_overview(
-    ws: WorkspaceClient,
     dashboard_metadata: DashboardMetadata,
     sql_backend: SqlBackend,
 ) -> None:
     """Separate test is required to set the owner of the used table at runtime"""
     query_name = "03_04_dashboards_pending_migration_by_owner_overview"
-    current_user = ws.current_user.me().user_name
     rows = [
         Row(
-            owner=current_user,
             percentage=round(100 * 1 / 4, 2),
             total=4,
             total_migrated=1,
@@ -719,8 +729,9 @@ def test_migration_progress_query_dashboards_pending_migration_by_owner_overview
     ]
     datasets = [d for d in dashboard_metadata.get_datasets() if d.name == query_name]
     assert len(datasets) == 1, f"Missing query: {query_name}"
+    # See `test_redash_dashboard_ownership_is_me` for why we exclude the owner
     query_results = list(sql_backend.fetch(datasets[0].query))
-    assert query_results == rows
+    assert exclude_fields_from_rows(query_results, "owner") == rows
 
 
 def test_migration_progress_query_dashboards_pending_migration(
     ws: WorkspaceClient,
     dashboard_metadata: DashboardMetadata,
     sql_backend: SqlBackend,
     dashboard_with_hive_tables: Dashboard,
     statuses_pending_migration: list[TableMigrationStatus],
 ) -> None:
     """Test if the pending table migrations are mentioned"""
     query_name = "03_05_dashboards_pending_migration"
     workspace_id = ws.get_workspace_id()
-    current_user = ws.current_user.me().user_name
     rows = []
     for status in statuses_pending_migration:
         table_full_name = ".".join(["hive_metastore", status.src_schema, status.src_table])
         row = Row(
             workspace_id=workspace_id,
-            owner=current_user,
             name=dashboard_with_hive_tables.name,
             dashboard_type="Redash",
             failure=f"Pending migration: {table_full_name}",
@@ -750,4 +759,5 @@ def test_migration_progress_query_dashboards_pending_migration(
     assert len(datasets) == 1, f"Missing query: {query_name}"
     query_results = list(sql_backend.fetch(datasets[0].query))
     query_results_filtered = [r for r in query_results if r.name == dashboard_with_hive_tables.name]
-    assert query_results_filtered == sorted(rows, key=lambda el: el.failure)
+    # See `test_redash_dashboard_ownership_is_me` for why we exclude the owner
+    assert exclude_fields_from_rows(query_results_filtered, "owner") == rows
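For reference, the exclude_fields_from_rows helper added above drops the named fields and rebuilds each row, so owner values that are only known at runtime stay out of the comparison. A usage sketch, assuming lsql's keyword-constructed Row (the import path may differ between lsql versions) and the helper above in scope:

from databricks.labs.lsql.core import Row

rows = [Row(owner="me@example.com", count=3), Row(owner="you@example.com", count=1)]
# Named fields are removed from every row; the remaining values are kept in order.
assert exclude_fields_from_rows(rows, "owner") == [Row(count=3), Row(count=1)]
# Fields that are absent are skipped rather than raising an error.
assert exclude_fields_from_rows(rows, "missing") == rows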
query_name = "03_02_dashboards_pending_migration_by_owner_bar_graph" - rows = [Row(owner=ws.current_user.me().user_name, count=3)] + rows = [Row(count=3)] datasets = [d for d in dashboard_metadata.get_datasets() if d.name == query_name] assert len(datasets) == 1, f"Missing query: {query_name}" query_results = list(sql_backend.fetch(datasets[0].query)) - assert query_results == rows + # See `test_redash_dashboard_ownership_is_me` for why we exclude the owner + assert exclude_fields_from_rows(query_results, "owner") == rows def test_migration_progress_query_dashboards_pending_migration_by_owner_overview( - ws: WorkspaceClient, dashboard_metadata: DashboardMetadata, sql_backend: SqlBackend, ) -> None: """Separate test is required to set the owner of the used table at runtime""" query_name = "03_04_dashboards_pending_migration_by_owner_overview" - current_user = ws.current_user.me().user_name rows = [ Row( - owner=current_user, percentage=round(100 * 1 / 4, 2), total=4, total_migrated=1, @@ -719,8 +729,9 @@ def test_migration_progress_query_dashboards_pending_migration_by_owner_overview ] datasets = [d for d in dashboard_metadata.get_datasets() if d.name == query_name] assert len(datasets) == 1, f"Missing query: {query_name}" + # See `test_redash_dashboard_ownership_is_me` for why we exclude the owner query_results = list(sql_backend.fetch(datasets[0].query)) - assert query_results == rows + assert exclude_fields_from_rows(query_results, "owner") == rows def test_migration_progress_query_dashboards_pending_migration( @@ -733,13 +744,11 @@ def test_migration_progress_query_dashboards_pending_migration( """Test if the tables migration are mentioned""" query_name = "03_05_dashboards_pending_migration" workspace_id = ws.get_workspace_id() - current_user = ws.current_user.me().user_name rows = [] for status in statuses_pending_migration: table_full_name = ".".join(["hive_metastore", status.src_schema, status.src_table]) row = Row( workspace_id=workspace_id, - owner=current_user, name=dashboard_with_hive_tables.name, dashboard_type="Redash", failure=f"Pending migration: {table_full_name}", @@ -750,4 +759,5 @@ def test_migration_progress_query_dashboards_pending_migration( assert len(datasets) == 1, f"Missing query: {query_name}" query_results = list(sql_backend.fetch(datasets[0].query)) query_results_filtered = [r for r in query_results if r.name == dashboard_with_hive_tables.name] - assert query_results_filtered == sorted(rows, key=lambda el: el.failure) + # See `test_redash_dashboard_ownership_is_me` for why we exclude the owner + assert exclude_fields_from_rows(query_results_filtered, "owner") == rows From c3026ee251f28f0cc6c1c12b7dca20d9f7682cb6 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 20 Dec 2024 11:36:16 +0100 Subject: [PATCH 125/129] Force commit --- tests/integration/progress/test_workflows.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/progress/test_workflows.py b/tests/integration/progress/test_workflows.py index 7ddc2794dc..706d7f547d 100644 --- a/tests/integration/progress/test_workflows.py +++ b/tests/integration/progress/test_workflows.py @@ -9,6 +9,7 @@ @retried(on=[NotFound, InvalidParameterValue], timeout=dt.timedelta(minutes=12)) def test_running_real_migration_progress_job(installation_ctx: MockInstallationContext) -> None: """Ensure that the migration-progress workflow can complete successfully.""" + # Limit the resources crawled by the assessment source_schema = installation_ctx.make_schema() 
     installation_ctx.make_table(schema_name=source_schema.name)

From 7043908b0a0bd3a90b60c3e39d6fd1d4cc77a155 Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Thu, 9 Jan 2025 09:47:50 +0100
Subject: [PATCH 126/129] Fix order of test rows

---
 tests/integration/queries/test_migration_progress.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index bdb9c2c6de..d17d92e2e1 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -745,7 +745,7 @@ def test_migration_progress_query_dashboards_pending_migration(
     query_name = "03_05_dashboards_pending_migration"
     workspace_id = ws.get_workspace_id()
     rows = []
-    for status in statuses_pending_migration:
+    for status in sorted(statuses_pending_migration, key=lambda s: (s.src_schema, s.src_table)):
         table_full_name = ".".join(["hive_metastore", status.src_schema, status.src_table])
         row = Row(
             workspace_id=workspace_id,

From e9da2655f54512bba9b3657493b9acdf580a0ebc Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Thu, 9 Jan 2025 11:24:44 +0100
Subject: [PATCH 127/129] Match job with dfsa

---
 .../queries/test_migration_progress.py | 28 ++++++++++++++++---
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index d17d92e2e1..5c218fbccc 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -151,7 +151,12 @@ def job_without_failures() -> JobInfo:
 
 @pytest.fixture
 def job_with_failures() -> JobInfo:
-    return JobInfo("3", success=0, failures="")  # Failure come from workflow problems below
+    """A job with failures
+
+    - See workflow_problems
+    - See dfsa
+    """
+    return JobInfo("3", success=0, failures="")
 
 
 @pytest.fixture
@@ -314,7 +319,7 @@ def query_problems(ws: WorkspaceClient, dashboards: list[Dashboard], dbfs_locati
 
 @pytest.fixture
 def dfsas(make_workspace_file, make_query, dbfs_location: str) -> list[DirectFsAccess]:
-    # TODO: Match the DFSAs with a job and dashboard
+    # TODO: Match the DFSAs with a dashboard
     workspace_file = make_workspace_file(content=f'df = spark.read.csv("{dbfs_location}")')
     query = make_query(sql_query=f"SELECT * FROM csv.`{dbfs_location}`")
     records = [
@@ -327,8 +332,8 @@ def dfsas(make_workspace_file, make_query, dbfs_location: str) -> list[DirectFsA
             source_id=str(workspace_file),
             source_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(hours=2.0),
             source_lineage=[
-                LineageAtom(object_type="WORKFLOW", object_id="my_workflow_id", other={"name": "my_workflow"}),
-                LineageAtom(object_type="TASK", object_id="my_workflow_id/my_task_id"),
+                LineageAtom(object_type="WORKFLOW", object_id="3", other={"name": "my_workflow"}),
+                LineageAtom(object_type="TASK", object_id="3/my_task_id"),
                 LineageAtom(object_type="NOTEBOOK", object_id="my_notebook_path"),
                 LineageAtom(object_type="FILE", object_id=str(workspace_file)),
             ],
@@ -486,6 +491,7 @@ def catalog_populated(  # pylint: disable=too-many-arguments
     used_tables: list[UsedTable],
     query_problems: list[QueryProblem],
     dashboards: list[Dashboard],
+    dfsas: list[DirectFsAccess],
 ):
     """Populate the UCX catalog with multiworkspace tables.
@@ -528,6 +534,20 @@ def catalog_populated(  # pylint: disable=too-many-arguments
         UsedTable,
         mode='overwrite',
     )
+    # Persists DirectFsAccess to propagate them to Jobs
+    runtime_ctx.sql_backend.save_table(
+        f'hive_metastore.{runtime_ctx.inventory_database}.directfs_in_paths',
+        [dfsa for dfsa in dfsas if dfsa.source_type != "QUERY"],
+        DirectFsAccess,
+        mode='overwrite',
+    )
+    # Persists DirectFsAccess to propagate them to Dashboards
+    runtime_ctx.sql_backend.save_table(
+        f'hive_metastore.{runtime_ctx.inventory_database}.directfs_in_queries',
+        [dfsa for dfsa in dfsas if dfsa.source_type == "QUERY"],
+        DirectFsAccess,
+        mode='overwrite',
+    )
     # Persists QueryProblems to propagate them to Dashboards
     runtime_ctx.sql_backend.save_table(
         f'hive_metastore.{runtime_ctx.inventory_database}.query_problems',
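The job match above is purely by convention: the leading WORKFLOW lineage atom carries the job id, which is why the fixture's DFSA now lands on JobInfo("3"). A sketch of how a consumer can key failures by that atom (LineageAtom as declared in ucx's source_code.base; the index shape and failure text are illustrative assumptions):

import collections

from databricks.labs.ucx.source_code.base import LineageAtom

lineage = [
    LineageAtom(object_type="WORKFLOW", object_id="3", other={"name": "my_workflow"}),
    LineageAtom(object_type="TASK", object_id="3/my_task_id"),
]
failures_by_job_id: dict[str, list[str]] = collections.defaultdict(list)
# Only lineage that leads with a WORKFLOW atom is attributed to a job.
if lineage and lineage[0].object_type == "WORKFLOW":
    failures_by_job_id[lineage[0].object_id].append("illustrative direct filesystem access failure")
assert "3" in failures_by_job_id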
From af90edaf09175026dd099f59e45776eefa65d91a Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Thu, 9 Jan 2025 11:49:02 +0100
Subject: [PATCH 128/129] Match DFSA with Dashboard

---
 .../queries/test_migration_progress.py | 31 ++++++++++---------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index 5c218fbccc..fc25ace8a9 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -254,20 +254,26 @@ def dbfs_location() -> str:
     return "dbfs://folder/file.csv"
 
 
+@pytest.fixture
+def dashboard_with_dfsa(make_dashboard, make_query, dbfs_location) -> Dashboard:
+    query_with_dfsa = make_query(sql_query=f"SELECT * FROM csv.`{dbfs_location}`")
+    return Dashboard.from_sdk_redash_dashboard(make_dashboard(query=query_with_dfsa))
+
+
 @pytest.fixture
 def dashboards(
     make_dashboard,
     make_query,
     dashboard_with_hive_tables: Dashboard,
     dashboard_with_uc_tables: Dashboard,
+    dashboard_with_dfsa: Dashboard,
     dbfs_location: str,
 ) -> list[Dashboard]:
     query_with_invalid_sql = make_query(sql_query="SELECT SUM(1")
-    query_with_dfsa = make_query(sql_query=f"SELECT * FROM csv.`{dbfs_location}`")
     records = [
         dashboard_with_hive_tables,
         Dashboard.from_sdk_redash_dashboard(make_dashboard(query=query_with_invalid_sql)),
-        Dashboard.from_sdk_redash_dashboard(make_dashboard(query=query_with_dfsa)),
+        dashboard_with_dfsa,
         dashboard_with_uc_tables,
     ]
     return records
@@ -318,10 +324,8 @@ def query_problems(ws: WorkspaceClient, dashboards: list[Dashboard], dbfs_locati
 
 @pytest.fixture
-def dfsas(make_workspace_file, make_query, dbfs_location: str) -> list[DirectFsAccess]:
+def dfsas(make_workspace_file, dbfs_location: str, dashboard_with_dfsa: Dashboard) -> list[DirectFsAccess]:
     workspace_file = make_workspace_file(content=f'df = spark.read.csv("{dbfs_location}")')
-    query = make_query(sql_query=f"SELECT * FROM csv.`{dbfs_location}`")
     records = [
@@ -346,11 +350,15 @@ def dfsas(make_workspace_file, make_query, dbfs_location: str) -> list[DirectFsA
             # Technically, the mocked code is reading the path, but marking it as write allows us to set the owner to
             # the current user, which we can test below.
             is_write=True,
-            source_id=query.id,
+            source_id=dashboard_with_dfsa.query_ids[0],
             source_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(hours=2.0),
             source_lineage=[
-                LineageAtom(object_type="DASHBOARD", object_id="my_dashboard_id", other={"name": "my_dashboard"}),
-                LineageAtom(object_type="QUERY", object_id=f"my_dashboard_id/{query.id}", other={"name": "my_query"}),
+                LineageAtom(object_type="DASHBOARD", object_id=dashboard_with_dfsa.id, other={"name": "my_dashboard"}),
+                LineageAtom(
+                    object_type="QUERY",
+                    object_id=f"{dashboard_with_dfsa.id}/{dashboard_with_dfsa.query_ids[0]}",
+                    other={"name": "my_query"},
+                ),
             ],
             assessment_start_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=5.0),
             assessment_end_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(minutes=2.0),
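Dashboards are matched the same way, except the lineage leads with a DASHBOARD atom followed by a QUERY atom whose id is "<dashboard_id>/<query_id>". A sketch with illustrative ids, again assuming ucx's LineageAtom:

from databricks.labs.ucx.source_code.base import LineageAtom

dashboard_id, query_id = "dash-1", "query-1"  # illustrative ids, not real fixture values
source_lineage = [
    LineageAtom(object_type="DASHBOARD", object_id=dashboard_id, other={"name": "my_dashboard"}),
    LineageAtom(object_type="QUERY", object_id=f"{dashboard_id}/{query_id}", other={"name": "my_query"}),
]
# A consumer attributes the DFSA to a dashboard by reading the leading DASHBOARD atom.
if len(source_lineage) >= 2 and source_lineage[0].object_type == "DASHBOARD":
    attributed_dashboard_id = source_lineage[0].object_id
assert attributed_dashboard_id == "dash-1"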
From 1a6d37c6b758fba107a027ab1adcfc7b27443feb Mon Sep 17 00:00:00 2001
From: Cor Zuurmond
Date: Thu, 9 Jan 2025 15:55:31 +0100
Subject: [PATCH 129/129] Bump job id

---
 tests/integration/queries/test_migration_progress.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/integration/queries/test_migration_progress.py b/tests/integration/queries/test_migration_progress.py
index fc25ace8a9..13350202b8 100644
--- a/tests/integration/queries/test_migration_progress.py
+++ b/tests/integration/queries/test_migration_progress.py
@@ -327,7 +327,7 @@ def dfsas(make_workspace_file, dbfs_location: str, dashboard_with_dfsa: Dashboar
     records = [
-        DirectFsAccess(
+        DirectFsAccess(  # TODO: Match with Job
             path=dbfs_location,
             is_read=False,
             # Technically, the mocked code is reading the path, but marking it as write allows us to set the owner to
             # the current user, which we can test below.
             is_write=True,
             source_id=str(workspace_file),
             source_timestamp=dt.datetime.now(dt.timezone.utc) - dt.timedelta(hours=2.0),
             source_lineage=[
-                LineageAtom(object_type="WORKFLOW", object_id="3", other={"name": "my_workflow"}),
-                LineageAtom(object_type="TASK", object_id="3/my_task_id"),
+                LineageAtom(object_type="WORKFLOW", object_id="4", other={"name": "my_workflow"}),
+                LineageAtom(object_type="TASK", object_id="4/my_task_id"),
                 LineageAtom(object_type="NOTEBOOK", object_id="my_notebook_path"),
                 LineageAtom(object_type="FILE", object_id=str(workspace_file)),
             ],