Improved documentation for readme notebook (#257)
Markdown handles multiple whitespace characters well anyway, and leaving in the
indentation allows for formatting lists, etc.

Fixes #253
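
For illustration (not part of this commit), a minimal sketch of the difference between the old whitespace-collapsing approach and the new indentation-stripping one; the sample docstring below is hypothetical:

```python
# Illustrative sketch only; the sample docstring is made up for this example.
import re

doc = """Scans all tables and records:
    - database name
    - table name
    - table location"""

# Old approach: collapse every run of whitespace, which flattens the list onto one line.
collapsed = re.sub(r"\s+", " ", doc)

# New approach: drop only the 4-space continuation indent and keep the line breaks,
# so Markdown can still render the bullet list.
dedented = "\n".join(
    line[4:] if line.startswith(" " * 4) else line for line in doc.splitlines()
)

print(collapsed)  # Scans all tables and records: - database name - table name - table location
print(dedented)   # list items stay on their own lines and render as bullets
```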
larsgeorge-db authored and FastLee committed Oct 25, 2023
1 parent 1963796 commit 10ea553
Showing 3 changed files with 90 additions and 101 deletions.
29 changes: 22 additions & 7 deletions src/databricks/labs/ucx/install.py
@@ -328,12 +328,23 @@ def _step_list(cls) -> list[str]:
step_list.append(task.workflow)
return step_list

@staticmethod
def _remove_extra_indentation(doc: str) -> str:
lines = doc.splitlines()
stripped = []
for line in lines:
if line.startswith(" " * 4):
stripped.append(line[4:])
else:
stripped.append(line)
return "\n".join(stripped)

def _create_readme(self):
md = [
"# UCX - The Unity Catalog Migration Assistant",
f'To troubleshoot, see [debug notebook]({self._notebook_link(f"{self._install_folder}/DEBUG.py")}).\n',
"Here are the URL and descriptions of jobs that trigger's various stages of migration.",
"All jobs are defined with necessary cluster configurations and DBR versions.",
"Here are the URLs and descriptions of workflows that trigger various stages of migration.",
"All jobs are defined with necessary cluster configurations and DBR versions.\n",
]
for step_name in self._step_list():
if step_name not in self._deployed_steps:
@@ -343,16 +354,20 @@ def _create_readme(self):
dashboard_link = ""
if step_name in self._dashboards:
dashboard_link = f"{self._ws.config.host}/sql/dashboards/{self._dashboards[step_name]}"
dashboard_link = f" (see [{step_name} dashboard]({dashboard_link}) after finish)"
dashboard_link = f"Go to the [{step_name} dashboard]({dashboard_link}) after running the jobs."
job_link = f"[{self._name(step_name)}]({self._ws.config.host}#job/{job_id})"
md.append(f"## {job_link}{dashboard_link}\n")
md.append("---\n\n")
md.append(f"## {job_link}\n\n")
md.append(f"{dashboard_link}\n\n")
md.append("The workflow consists of the following separate tasks:\n\n")
for t in self._sorted_tasks():
if t.workflow != step_name:
continue
doc = re.sub(r"\s+", " ", t.doc)
doc = self._remove_extra_indentation(t.doc)
doc = self._replace_inventory_variable(doc)
md.append(f" - `{t.name}`: {doc}")
md.append("")
md.append(f"### `{t.name}`\n\n")
md.append(f"{doc}\n")
md.append("\n\n")
preamble = ["# Databricks notebook source", "# MAGIC %md"]
intro = "\n".join(preamble + [f"# MAGIC {line}" for line in md])
path = f"{self._install_folder}/README.py"
146 changes: 52 additions & 94 deletions src/databricks/labs/ucx/runtime.py
@@ -19,41 +19,36 @@

@task("assessment")
def setup_schema(cfg: WorkspaceConfig):
"""Creates a database for UCX migration intermediate state"""
"""Creates a database for the UCX migration intermediate state. The name comes from the configuration file
and is set with the `inventory_database` key."""
backend = RuntimeBackend()
backend.execute(f"CREATE SCHEMA IF NOT EXISTS hive_metastore.{cfg.inventory_database}")


@task("assessment", depends_on=[setup_schema], notebook="hive_metastore/tables.scala")
def crawl_tables(_: WorkspaceConfig):
"""In this procedure, we systematically scan every table stored within the Hive Metastore. This scanning process
retrieves vital information for each table, which includes its distinct identifier or name, table format, and
storage location details.
The gathered metadata is then subsequently organized and documented within a designated storage entity referred to
as the `$inventory.tables` table. This table serves as an extensive inventory, offering a well-structured and
readily accessible point of reference for users, data engineers, and administrators."""
"""Iterates over all tables in the Hive Metastore of the current workspace and persists their metadata, such
as _database name_, _table name_, _table type_, _table location_, etc., in the Delta table named
`${inventory_database}.tables`. The `inventory_database` placeholder is set in the configuration file. The metadata
stored is then used in the subsequent tasks and workflows to, for example, find all Hive Metastore tables that
cannot easily be migrated to Unity Catalog."""


@task("assessment", job_cluster="tacl")
def setup_tacl(_: WorkspaceConfig):
"""(Optimization) Starts tacl job cluster in parallel to crawling tables"""
"""(Optimization) Starts `tacl` job cluster in parallel to crawling tables."""


@task("assessment", depends_on=[crawl_tables, setup_tacl], job_cluster="tacl")
def crawl_grants(cfg: WorkspaceConfig):
"""During this process, our methodology is purposefully designed to systematically scan and retrieve ACLs
(Access Control Lists) associated with Legacy Tables from the Hive Metastore. These ACLs encompass comprehensive
information, including permissions for users and groups, role-based access settings, and any custom access
configurations. These ACLs are then thoughtfully structured and securely stored within the `$inventory.grants`
table. This dedicated table serves as a central repository, ensuring the uninterrupted preservation of access
control data as we transition to the Databricks Unity Catalog.
By meticulously migrating these Legacy Table ACLs, we guarantee the seamless transfer of the data governance and
security framework established in our legacy Hive Metastore environment to our new Databricks Unity Catalog
setup. This approach not only safeguards data integrity and access control but also ensures a smooth and
secure transition for our data assets. It reinforces our commitment to data security and compliance throughout the
migration process and beyond"""
"""Scans the previously created Delta table named `${inventory_database}.tables` and issues a `SHOW GRANTS`
statement for every object to retrieve the permissions it has assigned to it. The permissions include information
such as the _principal_, _action type_, and the _table_ it applies to. This is persisted in the Delta table
`${inventory_database}.grants`. Other migration-related jobs use this inventory table to convert the legacy Table
ACLs to Unity Catalog permissions.
Note: This job runs on a separate cluster (named `tacl`) as it requires the proper configuration to have the Table
ACLs enabled and available for retrieval."""
backend = RuntimeBackend()
tables = TablesCrawler(backend, cfg.inventory_database)
grants = GrantsCrawler(tables)
@@ -62,77 +57,70 @@ def crawl_grants(cfg: WorkspaceConfig):

@task("assessment", depends_on=[setup_schema])
def crawl_mounts(cfg: WorkspaceConfig):
"""In this segment of the assessment, we will define the scope of the mount points intended for migration into the
Unity Catalog. As these objects are not compatible with the Unity Catalog paradigm, a key component of the
migration process involves transferring them to Unity Catalog External Locations.
"""Defines the scope of the _mount points_ intended for migration into Unity Catalog. As these objects are not
compatible with the Unity Catalog paradigm, a key component of the migration process involves transferring them
to Unity Catalog External Locations.
The assessment involves scanning the workspace to compile a list of all existing mount points and subsequently
storing this information in the `$inventory.mounts` table. This step enables you to create a snapshot of your
current Mount Point infrastructure, which is crucial for planning the migration."""
storing this information in the `$inventory.mounts` table. This is crucial for planning the migration."""
ws = WorkspaceClient(config=cfg.to_databricks_config())
mounts = Mounts(backend=RuntimeBackend(), ws=ws, inventory_database=cfg.inventory_database)
mounts.inventorize_mounts()


@task("assessment", depends_on=[crawl_mounts, crawl_tables])
def guess_external_locations(cfg: WorkspaceConfig):
"""In this section of the assessment, our objective is to determine the whereabouts of all the tables.
Specifically, we will focus on identifying locations that utilize Mount Points. Our goal is to identify the
External Locations necessary for a successful migration and store this information in the
`$inventory.external_locations` Table.
"""Determines the shared path prefixes of all the tables. Specifically, the focus is on identifying locations that
utilize mount points. The goal is to identify the _external locations_ necessary for a successful migration and
store this information in the `$inventory.external_locations` table.
The approach taken in this assessment involves the following steps:
- Extracting all the locations associated with tables that do not use DBFS (with a focus on those
using mount points).
- Scanning all these locations to identify common folders that can accommodate them.
- These identified external locations will be created subsequently prior to the actual table migration"""
- Extracting all the locations associated with tables that do not use DBFS directly, but a mount point instead
- Scanning all these locations to identify folders that can act as shared path prefixes
- These identified external locations will be created subsequently prior to the actual table migration"""
ws = WorkspaceClient(config=cfg.to_databricks_config())
crawler = ExternalLocationCrawler(ws, RuntimeBackend(), cfg.inventory_database)
crawler.snapshot()


@task("assessment", depends_on=[setup_schema])
def assess_jobs(cfg: WorkspaceConfig):
"""This module scans through all the jobs and identifies those that are not compatible with UC.
"""Scans through all the jobs and identifies those that are not compatible with UC. The list of all the jobs is
stored in the `$inventory.jobs` table.
It looks for:
- Clusters with DBR version earlier than 11.3
- Clusters with Databricks Runtime (DBR) version earlier than 11.3
- Clusters using Passthrough Authentication
- Clusters with incompatible spark config tags
- Clusters with incompatible Spark config tags
- Clusters referencing DBFS locations in one or more config options
Subsequently, the list of all the jobs is stored in the `$inventory.jobs` table."""
"""
ws = WorkspaceClient(config=cfg.to_databricks_config())
crawler = JobsCrawler(ws, RuntimeBackend(), cfg.inventory_database)
crawler.snapshot()


@task("assessment", depends_on=[setup_schema])
def assess_clusters(cfg: WorkspaceConfig):
"""This module scan through all the clusters and identifies those that are not compatible with UC.
"""Scan through all the clusters and identifies those that are not compatible with UC. The list of all the clusters
is stored in the`$inventory.clusters` table.
It looks for:
- Clusters with DBR version earlier than 11.3
- Clusters with Databricks Runtime (DBR) version earlier than 11.3
- Clusters using Passthrough Authentication
- Clusters with incompatible spark config tags
- Clusters referencing DBFS locations in one or more config options
Subsequently, the list of all the clusters is stored in the`$inventory.clusters` table."""
"""
ws = WorkspaceClient(config=cfg.to_databricks_config())
crawler = ClustersCrawler(ws, RuntimeBackend(), cfg.inventory_database)
crawler.snapshot()


@task("assessment", depends_on=[setup_schema])
def crawl_permissions(cfg: WorkspaceConfig):
"""As we commence the intricate migration process from Hive Metastore to the Databricks Unity Catalog, a critical
element of this transition is the thorough examination and preservation of permissions linked to a wide array of
Databricks Workspace components. These components encompass a broad spectrum of resources, including clusters,
cluster policies, jobs, models, experiments, SQL warehouses, SQL alerts, dashboards, queries, AWS IAM instance
profiles, and secret scopes. Ensuring the uninterrupted continuity of permissions is of paramount importance,
as it not only upholds data security but also facilitates a smooth and secure migration journey.
Our carefully designed procedure systematically scans and extracts permissions associated with these diverse
Databricks Workspace objects. This process encompasses rights granted to users and groups, role-based permissions,
custom access configurations, and any specialized policies governing resource access. The results of this
meticulous scan are methodically stored within the `$inventory.permissions` table, which serves as a central
repository for preserving and managing these crucial access control details."""
"""Scans the workspace-local groups and all their permissions. The list is stored in the `$inventory.permissions`
Delta table.
This is the first step for the _group migration_ process, which is continued in the `migrate-groups` workflow."""
ws = WorkspaceClient(config=cfg.to_databricks_config())
permission_manager = PermissionManager.factory(
ws,
Expand All @@ -151,49 +139,17 @@ def crawl_permissions(cfg: WorkspaceConfig):
dashboard="assessment",
)
def assessment_report(_: WorkspaceConfig):
"""This meticulously prepared report serves the purpose of evaluating and gauging the preparedness of a specific
workspace for a smooth transition to the Unity Catalog.
Our assessment procedure involves a comprehensive examination of various critical elements, including data schemas,
metadata structures, permissions, access controls, data assets, and dependencies within the workspace. We dive deep
into the intricacies of the current environment, taking into account factors like the complexity of data models,
the intricacy of access control lists (ACLs), the existence of custom scripts, and the overall data ecosystem.
The outcome of this thorough assessment is a comprehensive report that offers a holistic perspective on the
workspace's readiness for migration to the Databricks Unity Catalog. This report serves as a valuable resource,
provides insights, recommendations, and practical steps to ensure a seamless and successful transition.
It assists data engineers, administrators, and decision-makers in making informed decisions, addressing potential
challenges, and optimizing the migration strategy.
Through the creation of this readiness assessment report, we demonstrate our commitment to a well-planned,
risk-mitigated migration process. It guarantees that our migration to the Databricks Unity Catalog is not only
efficient but also seamlessly aligns with our data governance, security, and operational requirements, paving the
way for a new era of excellence in data management."""
"""Refreshes the assessment dashboard after all previous tasks have been completed. Note that you can access the
dashboard _before_ all tasks have been completed, but then only already completed information is shown."""


@task("migrate-groups", depends_on=[crawl_permissions])
def migrate_permissions(cfg: WorkspaceConfig):
"""As we embark on the complex journey of migrating from Hive Metastore to the Databricks Unity Catalog,
a crucial phase in this transition involves the careful management of permissions.
This intricate process entails several key steps: first, applying permissions to designated backup groups;
second, smoothly substituting workspace groups with account groups;
and finally, applying permissions to these newly established account groups.
Throughout this meticulous process, we ensure that existing permissions are thoughtfully mapped to backup groups
to maintain robust and consistent data security and access control during the migration.
Concurrently, we gracefully replace workspace groups with account groups to align with the structure and policies
of the Databricks Unity Catalog.
Once this transition is complete, we diligently apply permissions to the newly formed account groups,
preserving the existing access control framework while facilitating the seamless integration of data assets into
the Unity Catalog environment.
This careful orchestration of permissions guarantees the continuity of data security, minimizes disruptions to data
workflows, and ensures a smooth migration experience for both users and administrators. By executing this precise
operation, we not only meet data security and governance standards but also enhance the overall efficiency and
manageability of our data ecosystem, laying the foundation for a new era of data management excellence within our
organization.
"""Main phase of the group migration process. It does the following:
- Creates a backup of every workspace-local group, adding a prefix that can be set in the configuration
- Assigns the full set of permissions of the original group to the backup one
- Creates an account-level group with the original name of the workspace-local one
- Assigns the full set of permissions of the original group to the account-level one
See [interactive tutorial here](https://app.getreprise.com/launch/myM3VNn/)."""
ws = WorkspaceClient(config=cfg.to_databricks_config())
@@ -218,15 +174,17 @@ def migrate_permissions(cfg: WorkspaceConfig):

@task("migrate-groups-cleanup", depends_on=[migrate_permissions])
def delete_backup_groups(cfg: WorkspaceConfig):
"""Removes workspace-level backup groups"""
"""Last step of the group migration process. Removes all workspace-level backup groups, along with their
permissions."""
ws = WorkspaceClient(config=cfg.to_databricks_config())
group_manager = GroupManager(ws, cfg.groups)
group_manager.delete_backup_groups()


@task("destroy-schema")
def destroy_schema(cfg: WorkspaceConfig):
"""Removes the `$inventory` database"""
"""This _clean-up_ workflow allows to removes the `$inventory` database, with all the inventory tables created by
the previous workflow runs. Use this to reset the entire state and start with the assessment step again."""
RuntimeBackend().execute(f"DROP DATABASE {cfg.inventory_database} CASCADE")


16 changes: 16 additions & 0 deletions tests/unit/test_install.py
@@ -324,3 +324,19 @@ def test_create_readme(mocker):

p = re.compile(".*wl_1.*n3.*n1.*wl_2.*n2.*")
assert p.match(str(args[1]))


def test_replace_pydoc(mocker):
ws = mocker.Mock()
install = WorkspaceInstaller(ws)
doc = install._remove_extra_indentation(
"""Test1
Test2
Test3"""
)
assert (
doc
== """Test1
Test2
Test3"""
)
