diff --git a/src/databricks/labs/ucx/install.py b/src/databricks/labs/ucx/install.py
index 18ad64578c..2cfbc7b623 100644
--- a/src/databricks/labs/ucx/install.py
+++ b/src/databricks/labs/ucx/install.py
@@ -303,12 +303,23 @@ def _step_list(cls) -> list[str]:
             step_list.append(task.workflow)
         return step_list
 
+    @staticmethod
+    def _remove_extra_indentation(doc: str) -> str:
+        lines = doc.splitlines()
+        stripped = []
+        for line in lines:
+            if line.startswith(" " * 4):
+                stripped.append(line[4:])
+            else:
+                stripped.append(line)
+        return "\n".join(stripped)
+
     def _create_readme(self):
         md = [
             "# UCX - The Unity Catalog Migration Assistant",
             f'To troubleshoot, see [debug notebook]({self._notebook_link(f"{self._install_folder}/DEBUG.py")}).\n',
-            "Here are the URL and descriptions of jobs that trigger's various stages of migration.",
-            "All jobs are defined with necessary cluster configurations and DBR versions.",
+            "Here are the URLs and descriptions of workflows that trigger various stages of migration.",
+            "All jobs are defined with necessary cluster configurations and DBR versions.\n",
         ]
         for step_name in self._step_list():
             if step_name not in self._deployed_steps:
@@ -318,16 +329,20 @@ def _create_readme(self):
             dashboard_link = ""
             if step_name in self._dashboards:
                 dashboard_link = f"{self._ws.config.host}/sql/dashboards/{self._dashboards[step_name]}"
-                dashboard_link = f" (see [{step_name} dashboard]({dashboard_link}) after finish)"
+                dashboard_link = f"Go to the [{step_name} dashboard]({dashboard_link}) after running the jobs."
             job_link = f"[{self._name(step_name)}]({self._ws.config.host}#job/{job_id})"
-            md.append(f"## {job_link}{dashboard_link}\n")
+            md.append("---\n\n")
+            md.append(f"## {job_link}\n\n")
+            md.append(f"{dashboard_link}\n\n")
+            md.append("The workflow consists of the following separate tasks:\n\n")
             for t in self._sorted_tasks():
                 if t.workflow != step_name:
                     continue
-                doc = re.sub(r"\s+", " ", t.doc)
+                doc = self._remove_extra_indentation(t.doc)
                 doc = self._replace_inventory_variable(doc)
-                md.append(f" - `{t.name}`: {doc}")
-                md.append("")
+                md.append(f"### `{t.name}`\n\n")
+                md.append(f"{doc}\n")
+                md.append("\n\n")
         preamble = ["# Databricks notebook source", "# MAGIC %md"]
         intro = "\n".join(preamble + [f"# MAGIC {line}" for line in md])
         path = f"{self._install_folder}/README.py"
diff --git a/src/databricks/labs/ucx/runtime.py b/src/databricks/labs/ucx/runtime.py
index 834ce57c02..e4498f5051 100644
--- a/src/databricks/labs/ucx/runtime.py
+++ b/src/databricks/labs/ucx/runtime.py
@@ -19,41 +19,36 @@
 
 @task("assessment")
 def setup_schema(cfg: WorkspaceConfig):
-    """Creates a database for UCX migration intermediate state"""
+    """Creates a database for the UCX migration intermediate state. The name comes from the configuration file
+    and is set with the `inventory_database` key."""
     backend = RuntimeBackend()
     backend.execute(f"CREATE SCHEMA IF NOT EXISTS hive_metastore.{cfg.inventory_database}")
 
 
 @task("assessment", depends_on=[setup_schema], notebook="hive_metastore/tables.scala")
 def crawl_tables(_: WorkspaceConfig):
-    """In this procedure, we systematically scan every table stored within the Hive Metastore. This scanning process
-    retrieves vital information for each table, which includes its distinct identifier or name, table format, and
-    storage location details.
-
-    The gathered metadata is then subsequently organized and documented within a designated storage entity referred to
-    as the `$inventory.tables` table. This table serves as an extensive inventory, offering a well-structured and
-    readily accessible point of reference for users, data engineers, and administrators."""
+    """Iterates over all tables in the Hive Metastore of the current workspace and persists their metadata, such
+    as _database name_, _table name_, _table type_, _table location_, etc., in the Delta table named
+    `${inventory_database}.tables`. The `inventory_database` placeholder is set in the configuration file. The metadata
+    stored is then used in the subsequent tasks and workflows to, for example, find all Hive Metastore tables that
+    cannot easily be migrated to Unity Catalog."""
 
 
 @task("assessment", job_cluster="tacl")
 def setup_tacl(_: WorkspaceConfig):
-    """(Optimization) Starts tacl job cluster in parallel to crawling tables"""
+    """(Optimization) Starts the `tacl` job cluster in parallel to crawling tables."""
 
 
 @task("assessment", depends_on=[crawl_tables, setup_tacl], job_cluster="tacl")
 def crawl_grants(cfg: WorkspaceConfig):
-    """During this process, our methodology is purposefully designed to systematically scan and retrieve ACLs
-    (Access Control Lists) associated with Legacy Tables from the Hive Metastore. These ACLs encompass comprehensive
-    information, including permissions for users and groups, role-based access settings, and any custom access
-    configurations. These ACLs are then thoughtfully structured and securely stored within the `$inventory.grants`
-    table. This dedicated table serves as a central repository, ensuring the uninterrupted preservation of access
-    control data as we transition to the Databricks Unity Catalog.
-
-    By meticulously migrating these Legacy Table ACLs, we guarantee the seamless transfer of the data governance and
-    security framework established in our legacy Hive Metastore environment to our new Databricks Unity Catalog
-    setup. This approach not only safeguards data integrity and access control but also ensures a smooth and
-    secure transition for our data assets. It reinforces our commitment to data security and compliance throughout the
-    migration process and beyond"""
+    """Scans the previously created Delta table named `${inventory_database}.tables` and issues a `SHOW GRANTS`
+    statement for every object to retrieve the permissions assigned to it. The permissions include information
+    such as the _principal_, _action type_, and the _table_ it applies to. This is persisted in the Delta table
+    `${inventory_database}.grants`. Other migration-related jobs use this inventory table to convert the legacy Table
+    ACLs to Unity Catalog permissions.
+
+    Note: This job runs on a separate cluster (named `tacl`) as it requires the proper configuration to have the Table
+    ACLs enabled and available for retrieval."""
     backend = RuntimeBackend()
     tables = TablesCrawler(backend, cfg.inventory_database)
     grants = GrantsCrawler(tables)
@@ -62,13 +57,12 @@ def crawl_grants(cfg: WorkspaceConfig):
 
 @task("assessment", depends_on=[setup_schema])
 def crawl_mounts(cfg: WorkspaceConfig):
-    """In this segment of the assessment, we will define the scope of the mount points intended for migration into the
-    Unity Catalog. As these objects are not compatible with the Unity Catalog paradigm, a key component of the
-    migration process involves transferring them to Unity Catalog External Locations.
+    """Defines the scope of the _mount points_ intended for migration into Unity Catalog. As these objects are not
+    compatible with the Unity Catalog paradigm, a key component of the migration process involves transferring them
+    to Unity Catalog External Locations.
 
     The assessment involves scanning the workspace to compile a list of all existing mount points and subsequently
-    storing this information in the `$inventory.mounts` table. This step enables you to create a snapshot of your
-    current Mount Point infrastructure, which is crucial for planning the migration."""
+    storing this information in the `$inventory.mounts` table. This is crucial for planning the migration."""
     ws = WorkspaceClient(config=cfg.to_databricks_config())
     mounts = Mounts(backend=RuntimeBackend(), ws=ws, inventory_database=cfg.inventory_database)
     mounts.inventorize_mounts()
@@ -76,16 +70,14 @@ def crawl_mounts(cfg: WorkspaceConfig):
 
 @task("assessment", depends_on=[crawl_mounts, crawl_tables])
 def guess_external_locations(cfg: WorkspaceConfig):
-    """In this section of the assessment, our objective is to determine the whereabouts of all the tables.
-    Specifically, we will focus on identifying locations that utilize Mount Points. Our goal is to identify the
-    External Locations necessary for a successful migration and store this information in the
-    `$inventory.external_locations` Table.
+    """Determines the shared path prefixes of all the tables. Specifically, the focus is on identifying locations that
+    utilize mount points. The goal is to identify the _external locations_ necessary for a successful migration and
+    store this information in the `$inventory.external_locations` table.
 
     The approach taken in this assessment involves the following steps:
-      - Extracting all the locations associated with tables that do not use DBFS (with a focus on those
-        using mount points).
-      - Scanning all these locations to identify common folders that can accommodate them.
-      - These identified external locations will be created subsequently prior to the actual table migration"""
+      - Extracting all the locations associated with tables that do not use DBFS directly, but a mount point instead
+      - Scanning all these locations to identify folders that can act as shared path prefixes
+      - These identified external locations will be created subsequently prior to the actual table migration"""
     ws = WorkspaceClient(config=cfg.to_databricks_config())
     crawler = ExternalLocationCrawler(ws, RuntimeBackend(), cfg.inventory_database)
     crawler.snapshot()
@@ -93,13 +85,15 @@ def guess_external_locations(cfg: WorkspaceConfig):
 
 @task("assessment", depends_on=[setup_schema])
 def assess_jobs(cfg: WorkspaceConfig):
-    """This module scans through all the jobs and identifies those that are not compatible with UC.
+    """Scans through all the jobs and identifies those that are not compatible with UC. The list of all the jobs is
+    stored in the `$inventory.jobs` table.
+ It looks for: - - Clusters with DBR version earlier than 11.3 + - Clusters with Databricks Runtime (DBR) version earlier than 11.3 - Clusters using Passthrough Authentication - - Clusters with incompatible spark config tags + - Clusters with incompatible Spark config tags - Clusters referencing DBFS locations in one or more config options - Subsequently, the list of all the jobs is stored in the `$inventory.jobs` table.""" + """ ws = WorkspaceClient(config=cfg.to_databricks_config()) crawler = JobsCrawler(ws, RuntimeBackend(), cfg.inventory_database) crawler.snapshot() @@ -107,13 +101,15 @@ def assess_jobs(cfg: WorkspaceConfig): @task("assessment", depends_on=[setup_schema]) def assess_clusters(cfg: WorkspaceConfig): - """This module scan through all the clusters and identifies those that are not compatible with UC. + """Scan through all the clusters and identifies those that are not compatible with UC. The list of all the clusters + is stored in the`$inventory.clusters` table. + It looks for: - - Clusters with DBR version earlier than 11.3 + - Clusters with Databricks Runtime (DBR) version earlier than 11.3 - Clusters using Passthrough Authentication - Clusters with incompatible spark config tags - Clusters referencing DBFS locations in one or more config options - Subsequently, the list of all the clusters is stored in the`$inventory.clusters` table.""" + """ ws = WorkspaceClient(config=cfg.to_databricks_config()) crawler = ClustersCrawler(ws, RuntimeBackend(), cfg.inventory_database) crawler.snapshot() @@ -121,18 +117,10 @@ def assess_clusters(cfg: WorkspaceConfig): @task("assessment", depends_on=[setup_schema]) def crawl_permissions(cfg: WorkspaceConfig): - """As we commence the intricate migration process from Hive Metastore to the Databricks Unity Catalog, a critical - element of this transition is the thorough examination and preservation of permissions linked to a wide array of - Databricks Workspace components. These components encompass a broad spectrum of resources, including clusters, - cluster policies, jobs, models, experiments, SQL warehouses, SQL alerts, dashboards, queries, AWS IAM instance - profiles, and secret scopes. Ensuring the uninterrupted continuity of permissions is of paramount importance, - as it not only upholds data security but also facilitates a smooth and secure migration journey. - - Our carefully designed procedure systematically scans and extracts permissions associated with these diverse - Databricks Workspace objects. This process encompasses rights granted to users and groups, role-based permissions, - custom access configurations, and any specialized policies governing resource access. The results of this - meticulous scan are methodically stored within the `$inventory.permissions` table, which serves as a central - repository for preserving and managing these crucial access control details.""" + """Scans the workspace-local groups and all their permissions. The list is stored in the `$inventory.permissions` + Delta table. + + This is the first step for the _group migration_ process, which is continued in the `migrate-groups` workflow.""" ws = WorkspaceClient(config=cfg.to_databricks_config()) permission_manager = PermissionManager.factory( ws, @@ -151,49 +139,17 @@ def crawl_permissions(cfg: WorkspaceConfig): dashboard="assessment", ) def assessment_report(_: WorkspaceConfig): - """This meticulously prepared report serves the purpose of evaluating and gauging the preparedness of a specific - workspace for a smooth transition to the Unity Catalog. 
-
-    Our assessment procedure involves a comprehensive examination of various critical elements, including data schemas,
-    metadata structures, permissions, access controls, data assets, and dependencies within the workspace. We dive deep
-    into the intricacies of the current environment, taking into account factors like the complexity of data models,
-    the intricacy of access control lists (ACLs), the existence of custom scripts, and the overall data ecosystem.
-
-    The outcome of this thorough assessment is a comprehensive report that offers a holistic perspective on the
-    workspace's readiness for migration to the Databricks Unity Catalog. This report serves as a valuable resource,
-    provides insights, recommendations, and practical steps to ensure a seamless and successful transition.
-    It assists data engineers, administrators, and decision-makers in making informed decisions, addressing potential
-    challenges, and optimizing the migration strategy.
-
-    Through the creation of this readiness assessment report, we demonstrate our commitment to a well-planned,
-    risk-mitigated migration process. It guarantees that our migration to the Databricks Unity Catalog is not only
-    efficient but also seamlessly aligns with our data governance, security, and operational requirements, paving the
-    way for a new era of excellence in data management."""
+    """Refreshes the assessment dashboard after all previous tasks have been completed. Note that you can access the
+    dashboard _before_ all tasks have been completed, but it will then only show the information that is already available."""
 
 
 @task("migrate-groups", depends_on=[crawl_permissions])
 def migrate_permissions(cfg: WorkspaceConfig):
-    """As we embark on the complex journey of migrating from Hive Metastore to the Databricks Unity Catalog,
-    a crucial phase in this transition involves the careful management of permissions.
-    This intricate process entails several key steps: first, applying permissions to designated backup groups;
-    second, smoothly substituting workspace groups with account groups;
-    and finally, applying permissions to these newly established account groups.
-
-    Throughout this meticulous process, we ensure that existing permissions are thoughtfully mapped to backup groups
-    to maintain robust and consistent data security and access control during the migration.
-
-    Concurrently, we gracefully replace workspace groups with account groups to align with the structure and policies
-    of the Databricks Unity Catalog.
-
-    Once this transition is complete, we diligently apply permissions to the newly formed account groups,
-    preserving the existing access control framework while facilitating the seamless integration of data assets into
-    the Unity Catalog environment.
-
-    This careful orchestration of permissions guarantees the continuity of data security, minimizes disruptions to data
-    workflows, and ensures a smooth migration experience for both users and administrators. By executing this precise
-    operation, we not only meet data security and governance standards but also enhance the overall efficiency and
-    manageability of our data ecosystem, laying the foundation for a new era of data management excellence within our
-    organization.
+    """Main phase of the group migration process. It does the following:
+      - Creates a backup of every workspace-local group, adding a prefix that can be set in the configuration
+      - Assigns the full set of permissions of the original group to the backup one
+      - Creates an account-level group with the original name of the workspace-local one
+      - Assigns the full set of permissions of the original group to the account-level one
 
     See [interactive tutorial here](https://app.getreprise.com/launch/myM3VNn/)."""
     ws = WorkspaceClient(config=cfg.to_databricks_config())
@@ -218,7 +174,8 @@ def migrate_permissions(cfg: WorkspaceConfig):
 
 @task("migrate-groups-cleanup", depends_on=[migrate_permissions])
 def delete_backup_groups(cfg: WorkspaceConfig):
-    """Removes workspace-level backup groups"""
+    """Last step of the group migration process. Removes all workspace-level backup groups, along with their
+    permissions."""
     ws = WorkspaceClient(config=cfg.to_databricks_config())
     group_manager = GroupManager(ws, cfg.groups)
     group_manager.delete_backup_groups()
@@ -226,7 +183,8 @@ def migrate_permissions(cfg: WorkspaceConfig):
 
 @task("destroy-schema")
 def destroy_schema(cfg: WorkspaceConfig):
-    """Removes the `$inventory` database"""
+    """This _clean-up_ workflow removes the `$inventory` database, together with all the inventory tables created by
+    previous workflow runs. Use this to reset the entire state and start with the assessment step again."""
     RuntimeBackend().execute(f"DROP DATABASE {cfg.inventory_database} CASCADE")
 
 
diff --git a/tests/unit/test_install.py b/tests/unit/test_install.py
index c98ffba9bc..03796e460d 100644
--- a/tests/unit/test_install.py
+++ b/tests/unit/test_install.py
@@ -324,3 +324,19 @@ def test_create_readme(mocker):
 
     p = re.compile(".*wl_1.*n3.*n1.*wl_2.*n2.*")
     assert p.match(str(args[1]))
+
+
+def test_replace_pydoc(mocker):
+    ws = mocker.Mock()
+    install = WorkspaceInstaller(ws)
+    doc = install._remove_extra_indentation(
+        """Test1
+        Test2
+    Test3"""
+    )
+    assert (
+        doc
+        == """Test1
+    Test2
+Test3"""
+    )
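
For reference, a minimal standalone Python sketch of how the new `_remove_extra_indentation` helper behaves. It is not part of the patch; the helper body is copied from the diff above, and the sample docstring below is hypothetical, shaped like the task docstrings in runtime.py.

# Standalone sketch; only the function body is taken verbatim from the patch above.
def _remove_extra_indentation(doc: str) -> str:
    """Drop one level (4 spaces) of leading indentation from every line that has it."""
    stripped = []
    for line in doc.splitlines():
        if line.startswith(" " * 4):
            stripped.append(line[4:])
        else:
            stripped.append(line)
    return "\n".join(stripped)


if __name__ == "__main__":
    # Hypothetical docstring: the first line is flush-left, continuation lines carry
    # the usual 4-space docstring indentation.
    doc = (
        "Scans the workspace-local groups and all their permissions.\n"
        "    The list is stored in the `$inventory.permissions` Delta table."
    )
    print(_remove_extra_indentation(doc))
    # Both lines are printed flush-left, so paragraphs and bullet lists keep their
    # Markdown structure in the generated README instead of being collapsed into one
    # line, as the old re.sub(r"\s+", " ", t.doc) call did.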