Improved documentation for readme notebook (#257)
Markdown handles multiple whitespace characters well anyway, and leaving in the
indentation allows for formatting lists, etc.

Fixes #253
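
For illustration (not part of this commit), a minimal sketch of the difference between the old whitespace-collapsing approach and the new indentation-stripping one; the sample docstring below is hypothetical:

```python
# Illustrative sketch only; the sample docstring is made up for this example.
import re

doc = """Scans all tables and records:
    - database name
    - table name
    - table location"""

# Old approach: collapse every run of whitespace, which flattens the list onto one line.
collapsed = re.sub(r"\s+", " ", doc)

# New approach: drop only the 4-space continuation indent and keep the line breaks,
# so Markdown can still render the bullet list.
dedented = "\n".join(
    line[4:] if line.startswith(" " * 4) else line for line in doc.splitlines()
)

print(collapsed)  # Scans all tables and records: - database name - table name - table location
print(dedented)   # list items stay on their own lines and render as bullets
```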
larsgeorge-db authored and FastLee committed Oct 25, 2023
1 parent 1963796 commit 10ea553
Showing 3 changed files with 90 additions and 101 deletions.
29 changes: 22 additions & 7 deletions src/databricks/labs/ucx/install.py
@@ -328,12 +328,23 @@ def _step_list(cls) -> list[str]:
step_list.append(task.workflow)
return step_list

@staticmethod
def _remove_extra_indentation(doc: str) -> str:
lines = doc.splitlines()
stripped = []
for line in lines:
if line.startswith(" " * 4):
stripped.append(line[4:])
else:
stripped.append(line)
return "\n".join(stripped)

def _create_readme(self):
md = [
"# UCX - The Unity Catalog Migration Assistant",
f'To troubleshoot, see [debug notebook]({self._notebook_link(f"{self._install_folder}/DEBUG.py")}).\n',
"Here are the URL and descriptions of jobs that trigger's various stages of migration.",
"All jobs are defined with necessary cluster configurations and DBR versions.",
"Here are the URLs and descriptions of workflows that trigger various stages of migration.",
"All jobs are defined with necessary cluster configurations and DBR versions.\n",
]
for step_name in self._step_list():
if step_name not in self._deployed_steps:
@@ -343,16 +354,20 @@ def _create_readme(self):
dashboard_link = ""
if step_name in self._dashboards:
dashboard_link = f"{self._ws.config.host}/sql/dashboards/{self._dashboards[step_name]}"
dashboard_link = f" (see [{step_name} dashboard]({dashboard_link}) after finish)"
dashboard_link = f"Go to the [{step_name} dashboard]({dashboard_link}) after running the jobs."
job_link = f"[{self._name(step_name)}]({self._ws.config.host}#job/{job_id})"
md.append(f"## {job_link}{dashboard_link}\n")
md.append("---\n\n")
md.append(f"## {job_link}\n\n")
md.append(f"{dashboard_link}\n\n")
md.append("The workflow consists of the following separate tasks:\n\n")
for t in self._sorted_tasks():
if t.workflow != step_name:
continue
doc = re.sub(r"\s+", " ", t.doc)
doc = self._remove_extra_indentation(t.doc)
doc = self._replace_inventory_variable(doc)
md.append(f" - `{t.name}`: {doc}")
md.append("")
md.append(f"### `{t.name}`\n\n")
md.append(f"{doc}\n")
md.append("\n\n")
preamble = ["# Databricks notebook source", "# MAGIC %md"]
intro = "\n".join(preamble + [f"# MAGIC {line}" for line in md])
path = f"{self._install_folder}/README.py"
146 changes: 52 additions & 94 deletions src/databricks/labs/ucx/runtime.py
@@ -19,41 +19,36 @@

@task("assessment")
def setup_schema(cfg: WorkspaceConfig):
"""Creates a database for UCX migration intermediate state"""
"""Creates a database for the UCX migration intermediate state. The name comes from the configuration file
and is set with the `inventory_database` key."""
backend = RuntimeBackend()
backend.execute(f"CREATE SCHEMA IF NOT EXISTS hive_metastore.{cfg.inventory_database}")


@task("assessment", depends_on=[setup_schema], notebook="hive_metastore/tables.scala")
def crawl_tables(_: WorkspaceConfig):
"""In this procedure, we systematically scan every table stored within the Hive Metastore. This scanning process
retrieves vital information for each table, which includes its distinct identifier or name, table format, and
storage location details.
The gathered metadata is then subsequently organized and documented within a designated storage entity referred to
as the `$inventory.tables` table. This table serves as an extensive inventory, offering a well-structured and
readily accessible point of reference for users, data engineers, and administrators."""
"""Iterates over all tables in the Hive Metastore of the current workspace and persists their metadata, such
as _database name_, _table name_, _table type_, _table location_, etc., in the Delta table named
`${inventory_database}.tables`. The `inventory_database` placeholder is set in the configuration file. The metadata
stored is then used in the subsequent tasks and workflows to, for example, find all Hive Metastore tables that
cannot easily be migrated to Unity Catalog."""


@task("assessment", job_cluster="tacl")
def setup_tacl(_: WorkspaceConfig):
"""(Optimization) Starts tacl job cluster in parallel to crawling tables"""
"""(Optimization) Starts `tacl` job cluster in parallel to crawling tables."""


@task("assessment", depends_on=[crawl_tables, setup_tacl], job_cluster="tacl")
def crawl_grants(cfg: WorkspaceConfig):
"""During this process, our methodology is purposefully designed to systematically scan and retrieve ACLs
(Access Control Lists) associated with Legacy Tables from the Hive Metastore. These ACLs encompass comprehensive
information, including permissions for users and groups, role-based access settings, and any custom access
configurations. These ACLs are then thoughtfully structured and securely stored within the `$inventory.grants`
table. This dedicated table serves as a central repository, ensuring the uninterrupted preservation of access
control data as we transition to the Databricks Unity Catalog.
By meticulously migrating these Legacy Table ACLs, we guarantee the seamless transfer of the data governance and
security framework established in our legacy Hive Metastore environment to our new Databricks Unity Catalog
setup. This approach not only safeguards data integrity and access control but also ensures a smooth and
secure transition for our data assets. It reinforces our commitment to data security and compliance throughout the
migration process and beyond"""
"""Scans the previously created Delta table named `${inventory_database}.tables` and issues a `SHOW GRANTS`
statement for every object to retrieve the permissions it has assigned to it. The permissions include information
such as the _principal_, _action type_, and the _table_ it applies to. This is persisted in the Delta table
`${inventory_database}.grants`. Other migration-related jobs use this inventory table to convert the legacy Table
ACLs to Unity Catalog permissions.
Note: This job runs on a separate cluster (named `tacl`) as it requires the proper configuration to have the Table
ACLs enabled and available for retrieval."""
backend = RuntimeBackend()
tables = TablesCrawler(backend, cfg.inventory_database)
grants = GrantsCrawler(tables)
@@ -62,77 +57,70 @@ def crawl_grants(cfg: WorkspaceConfig):

@task("assessment", depends_on=[setup_schema])
def crawl_mounts(cfg: WorkspaceConfig):
"""In this segment of the assessment, we will define the scope of the mount points intended for migration into the
Unity Catalog. As these objects are not compatible with the Unity Catalog paradigm, a key component of the
migration process involves transferring them to Unity Catalog External Locations.
"""Defines the scope of the _mount points_ intended for migration into Unity Catalog. As these objects are not
compatible with the Unity Catalog paradigm, a key component of the migration process involves transferring them
to Unity Catalog External Locations.
The assessment involves scanning the workspace to compile a list of all existing mount points and subsequently
storing this information in the `$inventory.mounts` table. This step enables you to create a snapshot of your
current Mount Point infrastructure, which is crucial for planning the migration."""
storing this information in the `$inventory.mounts` table. This is crucial for planning the migration."""
ws = WorkspaceClient(config=cfg.to_databricks_config())
mounts = Mounts(backend=RuntimeBackend(), ws=ws, inventory_database=cfg.inventory_database)
mounts.inventorize_mounts()


@task("assessment", depends_on=[crawl_mounts, crawl_tables])
def guess_external_locations(cfg: WorkspaceConfig):
"""In this section of the assessment, our objective is to determine the whereabouts of all the tables.
Specifically, we will focus on identifying locations that utilize Mount Points. Our goal is to identify the
External Locations necessary for a successful migration and store this information in the
`$inventory.external_locations` Table.
"""Determines the shared path prefixes of all the tables. Specifically, the focus is on identifying locations that
utilize mount points. The goal is to identify the _external locations_ necessary for a successful migration and
store this information in the `$inventory.external_locations` table.
The approach taken in this assessment involves the following steps:
- Extracting all the locations associated with tables that do not use DBFS (with a focus on those
using mount points).
- Scanning all these locations to identify common folders that can accommodate them.
- These identified external locations will be created subsequently prior to the actual table migration"""
- Extracting all the locations associated with tables that do not use DBFS directly, but a mount point instead
- Scanning all these locations to identify folders that can act as shared path prefixes
- These identified external locations will be created subsequently prior to the actual table migration"""
ws = WorkspaceClient(config=cfg.to_databricks_config())
crawler = ExternalLocationCrawler(ws, RuntimeBackend(), cfg.inventory_database)
crawler.snapshot()


@task("assessment", depends_on=[setup_schema])
def assess_jobs(cfg: WorkspaceConfig):
"""This module scans through all the jobs and identifies those that are not compatible with UC.
"""Scans through all the jobs and identifies those that are not compatible with UC. The list of all the jobs is
stored in the `$inventory.jobs` table.
It looks for:
- Clusters with DBR version earlier than 11.3
- Clusters with Databricks Runtime (DBR) version earlier than 11.3
- Clusters using Passthrough Authentication
- Clusters with incompatible spark config tags
- Clusters with incompatible Spark config tags
- Clusters referencing DBFS locations in one or more config options
Subsequently, the list of all the jobs is stored in the `$inventory.jobs` table."""
"""
ws = WorkspaceClient(config=cfg.to_databricks_config())
crawler = JobsCrawler(ws, RuntimeBackend(), cfg.inventory_database)
crawler.snapshot()


@task("assessment", depends_on=[setup_schema])
def assess_clusters(cfg: WorkspaceConfig):
"""This module scan through all the clusters and identifies those that are not compatible with UC.
"""Scan through all the clusters and identifies those that are not compatible with UC. The list of all the clusters
is stored in the`$inventory.clusters` table.
It looks for:
- Clusters with DBR version earlier than 11.3
- Clusters with Databricks Runtime (DBR) version earlier than 11.3
- Clusters using Passthrough Authentication
- Clusters with incompatible spark config tags
- Clusters referencing DBFS locations in one or more config options
Subsequently, the list of all the clusters is stored in the`$inventory.clusters` table."""
"""
ws = WorkspaceClient(config=cfg.to_databricks_config())
crawler = ClustersCrawler(ws, RuntimeBackend(), cfg.inventory_database)
crawler.snapshot()


@task("assessment", depends_on=[setup_schema])
def crawl_permissions(cfg: WorkspaceConfig):
"""As we commence the intricate migration process from Hive Metastore to the Databricks Unity Catalog, a critical
element of this transition is the thorough examination and preservation of permissions linked to a wide array of
Databricks Workspace components. These components encompass a broad spectrum of resources, including clusters,
cluster policies, jobs, models, experiments, SQL warehouses, SQL alerts, dashboards, queries, AWS IAM instance
profiles, and secret scopes. Ensuring the uninterrupted continuity of permissions is of paramount importance,
as it not only upholds data security but also facilitates a smooth and secure migration journey.
Our carefully designed procedure systematically scans and extracts permissions associated with these diverse
Databricks Workspace objects. This process encompasses rights granted to users and groups, role-based permissions,
custom access configurations, and any specialized policies governing resource access. The results of this
meticulous scan are methodically stored within the `$inventory.permissions` table, which serves as a central
repository for preserving and managing these crucial access control details."""
"""Scans the workspace-local groups and all their permissions. The list is stored in the `$inventory.permissions`
Delta table.
This is the first step for the _group migration_ process, which is continued in the `migrate-groups` workflow."""
ws = WorkspaceClient(config=cfg.to_databricks_config())
permission_manager = PermissionManager.factory(
ws,
Expand All @@ -151,49 +139,17 @@ def crawl_permissions(cfg: WorkspaceConfig):
dashboard="assessment",
)
def assessment_report(_: WorkspaceConfig):
"""This meticulously prepared report serves the purpose of evaluating and gauging the preparedness of a specific
workspace for a smooth transition to the Unity Catalog.
Our assessment procedure involves a comprehensive examination of various critical elements, including data schemas,
metadata structures, permissions, access controls, data assets, and dependencies within the workspace. We dive deep
into the intricacies of the current environment, taking into account factors like the complexity of data models,
the intricacy of access control lists (ACLs), the existence of custom scripts, and the overall data ecosystem.
The outcome of this thorough assessment is a comprehensive report that offers a holistic perspective on the
workspace's readiness for migration to the Databricks Unity Catalog. This report serves as a valuable resource,
provides insights, recommendations, and practical steps to ensure a seamless and successful transition.
It assists data engineers, administrators, and decision-makers in making informed decisions, addressing potential
challenges, and optimizing the migration strategy.
Through the creation of this readiness assessment report, we demonstrate our commitment to a well-planned,
risk-mitigated migration process. It guarantees that our migration to the Databricks Unity Catalog is not only
efficient but also seamlessly aligns with our data governance, security, and operational requirements, paving the
way for a new era of excellence in data management."""
"""Refreshes the assessment dashboard after all previous tasks have been completed. Note that you can access the
dashboard _before_ all tasks have been completed, but then only already completed information is shown."""


@task("migrate-groups", depends_on=[crawl_permissions])
def migrate_permissions(cfg: WorkspaceConfig):
"""As we embark on the complex journey of migrating from Hive Metastore to the Databricks Unity Catalog,
a crucial phase in this transition involves the careful management of permissions.
This intricate process entails several key steps: first, applying permissions to designated backup groups;
second, smoothly substituting workspace groups with account groups;
and finally, applying permissions to these newly established account groups.
Throughout this meticulous process, we ensure that existing permissions are thoughtfully mapped to backup groups
to maintain robust and consistent data security and access control during the migration.
Concurrently, we gracefully replace workspace groups with account groups to align with the structure and policies
of the Databricks Unity Catalog.
Once this transition is complete, we diligently apply permissions to the newly formed account groups,
preserving the existing access control framework while facilitating the seamless integration of data assets into
the Unity Catalog environment.
This careful orchestration of permissions guarantees the continuity of data security, minimizes disruptions to data
workflows, and ensures a smooth migration experience for both users and administrators. By executing this precise
operation, we not only meet data security and governance standards but also enhance the overall efficiency and
manageability of our data ecosystem, laying the foundation for a new era of data management excellence within our
organization.
"""Main phase of the group migration process. It does the following:
- Creates a backup of every workspace-local group, adding a prefix that can be set in the configuration
- Assigns the full set of permissions of the original group to the backup one
- Creates an account-level group with the original name of the workspace-local one
- Assigns the full set of permissions of the original group to the account-level one
See [interactive tutorial here](https://app.getreprise.com/launch/myM3VNn/)."""
ws = WorkspaceClient(config=cfg.to_databricks_config())
@@ -218,15 +174,17 @@ def migrate_permissions(cfg: WorkspaceConfig):

@task("migrate-groups-cleanup", depends_on=[migrate_permissions])
def delete_backup_groups(cfg: WorkspaceConfig):
"""Removes workspace-level backup groups"""
"""Last step of the group migration process. Removes all workspace-level backup groups, along with their
permissions."""
ws = WorkspaceClient(config=cfg.to_databricks_config())
group_manager = GroupManager(ws, cfg.groups)
group_manager.delete_backup_groups()


@task("destroy-schema")
def destroy_schema(cfg: WorkspaceConfig):
"""Removes the `$inventory` database"""
"""This _clean-up_ workflow allows to removes the `$inventory` database, with all the inventory tables created by
the previous workflow runs. Use this to reset the entire state and start with the assessment step again."""
RuntimeBackend().execute(f"DROP DATABASE {cfg.inventory_database} CASCADE")


16 changes: 16 additions & 0 deletions tests/unit/test_install.py
@@ -324,3 +324,19 @@ def test_create_readme(mocker):

p = re.compile(".*wl_1.*n3.*n1.*wl_2.*n2.*")
assert p.match(str(args[1]))


def test_replace_pydoc(mocker):
ws = mocker.Mock()
install = WorkspaceInstaller(ws)
doc = install._remove_extra_indentation(
"""Test1
Test2
Test3"""
)
assert (
doc
== """Test1
Test2
Test3"""
)
