diff --git a/.gitignore b/.gitignore index a1d94557d3..7639e5b67d 100644 --- a/.gitignore +++ b/.gitignore @@ -145,4 +145,7 @@ cython_debug/ # dev files and scratches dev/cleanup.py -Support \ No newline at end of file +Support + +.databricks +.vscode \ No newline at end of file diff --git a/README.md b/README.md index 772a845766..9bae6fdba4 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,30 @@ # UCX - Unity Catalog Migration Toolkit -This repo contains various functions and utilities for UC Upgrade. +Your best companion for enabling the Unity Catalog. + +## Installation + +The `./install.sh` script will guide you through the installation process. Make sure you have Python 3.10 (or greater) +installed on your workstation, and you've configured authentication for +the [Databricks Workspace](https://databricks-sdk-py.readthedocs.io/en/latest/authentication.html#default-authentication-flow). + +![install wizard](./examples/ucx-install.gif) + +The easiest way to install and authenticate is through a [Databricks configuration profile](https://docs.databricks.com/en/dev-tools/auth.html#databricks-client-unified-authentication): + +```shell +export DATABRICKS_CONFIG_PROFILE=ABC +./install.sh +``` + +You can also specify environment variables in a more direct way, like in this example for installing +on an Azure Databricks Workspace using the Azure CLI authentication: + +```shell +az login +export DATABRICKS_HOST=https://adb-123....azuredatabricks.net/ +./install.sh +``` ## Latest working version and how-to diff --git a/bin/install.py b/bin/install.py deleted file mode 100644 index fa8575e2a5..0000000000 --- a/bin/install.py +++ /dev/null @@ -1,119 +0,0 @@ -import argparse -import logging -import os -import shutil -import subprocess -import sys -import tempfile -from io import BytesIO - -from databricks.sdk import WorkspaceClient -from databricks.sdk.core import DatabricksError -from databricks.sdk.service.workspace import ImportFormat - -from databricks.labs.ucx.logger 
import _install - -INSTALL_NOTEBOOK = """ -# Databricks notebook source -# MAGIC %md -# MAGIC # UCX - The UC Migration Toolkit -# MAGIC -# MAGIC This notebook installs `ucx` as a wheel package locally -# MAGIC and then restarts the Python interpreter. - -# COMMAND ---------- - -# MAGIC %pip install /Workspace{remote_wheel_file} -dbutils.library.restartPython() - -""" - -# install logging backend -_install() -logger = logging.getLogger(__name__) - -# parse command line parameters -parser = argparse.ArgumentParser(prog="ucx", description="Builds and installs ucx.") -parser.add_argument("--folder", "-f", default="ucx", help="name of folder in workspace, default: ucx") -parser.add_argument("--quiet", action="store_true", help="suppress extraneous information") -parser.add_argument("--debug", action="store_true", help="enable debug mode") -args = parser.parse_args() - -# adjust logging levels as needed -if args.debug: - logging.getLogger("databricks").setLevel("DEBUG") - - -def delete_local_dir(dir_name): - """Helper to delete a directory""" - try: - shutil.rmtree(dir_name) - except OSError as e: - logger.error(f"Error: {e.filename} - {e.strerror}.") - - -def folder_exists(folder_base, ws): - """Helper to check if a workspace folder exists""" - folder_files = [] - try: - for f in ws.workspace.list(folder_base): - folder_files.append(f.path) - logger.debug(f"Folder files: {folder_files}") - return True - except DatabricksError: - return False - - -def build_wheel(): - """Helper to build the wheel package""" - tmp_dir = tempfile.TemporaryDirectory() - logger.debug(f"Created temporary directory: {tmp_dir.name}") - streams = {} - if args.quiet: - streams = { - "stdout": subprocess.DEVNULL, - "stderr": subprocess.DEVNULL, - } - subprocess.run( - [sys.executable, "-m", "pip", "wheel", "--no-deps", "--wheel-dir", tmp_dir.name, ".."], **streams, check=True - ) - return tmp_dir.name - - -def upload_artifacts(folder_base, local_wheel_file, wheel_file_name, ws): - """Helper to 
upload artifacts into a workspace folder""" - remote_wheel_file = f"{folder_base}/{wheel_file_name}" - remote_notebook_file = f"{folder_base}/install_ucx.py" - logger.info(f"Remote wheel file: {remote_wheel_file}") - logger.info(f"Remote notebook file: {remote_notebook_file}") - logger.info("Uploading...") - ws.workspace.mkdirs(folder_base) - with open(local_wheel_file, "rb") as fh: - ws.workspace.upload(path=remote_wheel_file, content=fh.read(), format=ImportFormat.AUTO) - buf = BytesIO(INSTALL_NOTEBOOK.format(remote_wheel_file=remote_wheel_file).encode()) - ws.workspace.upload(path=remote_notebook_file, content=buf) - - -def main(): - # preflight check - ws = WorkspaceClient() - folder_base = f"/Users/{ws.current_user.me().user_name}/{args.folder}" - if folder_exists(folder_base, ws): - logger.error(f"ERROR: Remote folder '{folder_base}' already exists, aborting!") - sys.exit(-1) - # build wheel in temp directory - tmp_dir = build_wheel() - # get wheel name as first file in the temp directory - files = os.listdir(tmp_dir) - wheel_file_name = files[0] - local_wheel_file = tmp_dir + "/" + wheel_file_name - logger.info(f"Wheel file: {wheel_file_name}") - # upload wheel and starer notebook to workspace - upload_artifacts(folder_base, local_wheel_file, wheel_file_name, ws) - # cleanup - delete_local_dir(tmp_dir) - logger.info("DONE.") - - -if __name__ == "__main__": - main() diff --git a/examples/ucx-install.gif b/examples/ucx-install.gif new file mode 100644 index 0000000000..4024163dea Binary files /dev/null and b/examples/ucx-install.gif differ diff --git a/install.sh b/install.sh new file mode 100755 index 0000000000..c4de8b185f --- /dev/null +++ b/install.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +# This script will eventually be replaced with `databricks labs install ucx` command. 
+ +# Initialize an empty array to store Python 3 binary paths +python3_binaries=() + +# Split the $PATH variable into an array using ':' as the delimiter +IFS=':' read -ra path_dirs <<< "$PATH" + +# Iterate over each directory in the $PATH +for dir in "${path_dirs[@]}"; do + # Construct the full path to the python3 binary in the current directory + python3_path="${dir}/python3" + + # Check if the python3 binary exists and is executable + if [ -x "$python3_path" ]; then + python3_binaries+=("$python3_path") + fi +done + +if [ -z "${python3_binaries[*]}" ]; then + echo "[!] No Python binaries detected" + exit 1 +fi + +# Check versions for all Python binaries found +python_versions=() +for python_binary in "${python3_binaries[@]}"; do + python_version=$("$python_binary" --version | awk '{print $2}') + python_versions+=("$python_version -> $(realpath "$python_binary")") +done + +IFS=$'\n' python_versions=($(printf "%s\n" "${python_versions[@]}" | sort -V)) + +py="/dev/null" +for version_and_binary in "${python_versions[@]}"; do + echo "[i] found Python $version_and_binary" + # Strip the "VERSION -> " prefix; IFS-based splitting truncates paths that contain '-', '>' or spaces + py="${version_and_binary#* -> }" +done + +echo "[i] latest python is $py" + +tmp_dir=$(mktemp -d) + +# Create isolated Virtualenv with the latest Python version +# in the ephemeral temporary directory +"$py" -m venv "$tmp_dir" + +. "$tmp_dir/bin/activate" + +# Use the Python from Virtualenv +py="$tmp_dir/bin/python" + +echo "[+] installing dependencies within ephemeral Virtualenv: $tmp_dir" +# Install all project dependencies, so that installer can proceed +"$py" -m pip install --quiet -e . 
+ +# Invoke python module of the install app directly, +# without console_scripts entrypoint +$py -m databricks.labs.ucx.cli.app install + +rm -r "$tmp_dir" diff --git a/src/databricks/labs/ucx/cli/app.py b/src/databricks/labs/ucx/cli/app.py index 8baf199abd..47f7525296 100644 --- a/src/databricks/labs/ucx/cli/app.py +++ b/src/databricks/labs/ucx/cli/app.py @@ -1,13 +1,30 @@ +import logging import os from pathlib import Path from typing import Annotated import typer +from databricks.sdk import WorkspaceClient from typer import Typer +from databricks.labs.ucx.__about__ import __version__ +from databricks.labs.ucx.logger import _install + +_install() +logging.root.setLevel("INFO") +logger = logging.getLogger(__name__) + app = Typer(name="UC Migration Toolkit", pretty_exceptions_show_locals=True) +@app.command() +def install(): + from databricks.labs.ucx.install import main + + ws = WorkspaceClient(product="ucx", product_version=__version__) + main(ws, verbose=False) + + @app.command() def migrate_groups(config_file: Annotated[Path, typer.Argument(help="Path to config file")] = "migration_config.yml"): from databricks.labs.ucx.config import MigrationConfig diff --git a/src/databricks/labs/ucx/config.py b/src/databricks/labs/ucx/config.py index e5e2e625d6..94be937e56 100644 --- a/src/databricks/labs/ucx/config.py +++ b/src/databricks/labs/ucx/config.py @@ -82,6 +82,10 @@ def from_dict(cls, raw: dict): return cls(**raw) +# Used to set the right expectation about configuration file schema +_CONFIG_VERSION = 1 + + @dataclass class MigrationConfig: inventory_database: str @@ -112,10 +116,19 @@ def inner(x): return dict(result) return x - return inner(self) + serialized = inner(self) + serialized["version"] = _CONFIG_VERSION + return serialized @classmethod def from_dict(cls, raw: dict) -> "MigrationConfig": + stored_version = raw.get("version", None) + if stored_version != _CONFIG_VERSION: + msg = ( + f"Unsupported config version: {stored_version}. 
" + f"UCX v{__version__} expects config version to be {_CONFIG_VERSION}" + ) + raise ValueError(msg) return cls( inventory_database=raw.get("inventory_database"), tacl=TaclConfig.from_dict(raw.get("tacl", {})), diff --git a/src/databricks/labs/ucx/install/__init__.py b/src/databricks/labs/ucx/install/__init__.py new file mode 100644 index 0000000000..bfdbfed954 --- /dev/null +++ b/src/databricks/labs/ucx/install/__init__.py @@ -0,0 +1,138 @@ +import logging +import subprocess +import sys +import tempfile +import webbrowser +from io import BytesIO +from pathlib import Path + +import yaml +from databricks.sdk import WorkspaceClient +from databricks.sdk.core import DatabricksError +from databricks.sdk.service.workspace import ImportFormat + +from databricks.labs.ucx.config import GroupsConfig, MigrationConfig, TaclConfig + +INSTALL_NOTEBOOK = """ +# Databricks notebook source +# MAGIC %md +# MAGIC # UCX - The UC Migration Toolkit +# MAGIC +# MAGIC This notebook installs `ucx` as a wheel package locally +# MAGIC and then restarts the Python interpreter. 
+ +# COMMAND ---------- + +# MAGIC %pip install /Workspace{remote_wheel_file} +dbutils.library.restartPython() + +""" + +logger = logging.getLogger(__name__) + + +def main(ws: WorkspaceClient, *, verbose: bool = False): + folder_base = f"/Users/{ws.current_user.me().user_name}/.ucx" + # create configuration file only if this installer is called for the first time, + # otherwise just open file in the browser + save_config(ws, folder_base) + with tempfile.TemporaryDirectory() as tmp_dir: + logger.debug(f"Created temporary directory: {tmp_dir}") + # build wheel in temp directory + wheel_file = build_wheel(tmp_dir, verbose=verbose) + logger.info(f"Wheel file: {wheel_file}") + # (re)upload wheel and starer notebook to workspace + upload_artifacts(ws, folder_base, wheel_file) + logger.info("DONE.") + + +def save_config(ws: WorkspaceClient, folder_base: str): + config_path = f"{folder_base}/config.yml" + ws_file_url = f"{ws.config.host}/#workspace{config_path}" + try: + ws.workspace.get_status(config_path) + logger.info(f"UCX is already configured. 
See {ws_file_url}") + if question("Open config file in the browser and continue installing?", default="yes") == "yes": + webbrowser.open(ws_file_url) + return config_path + except DatabricksError as err: + if err.error_code != "RESOURCE_DOES_NOT_EXIST": + raise err + + logger.info("Please answer a couple of questions to configure Unity Catalog migration") + + config = MigrationConfig( + inventory_database=question("Inventory Database", default="ucx"), + groups=GroupsConfig( + selected=question("Comma-separated list of workspace group names to migrate").split(","), + backup_group_prefix=question("Backup prefix", default="db-temp-"), + ), + tacl=TaclConfig(auto=True), + log_level=question("Log level", default="INFO"), + num_threads=int(question("Number of threads", default="8")), + ) + ws.workspace.upload(config_path, yaml.dump(config.as_dict()).encode("utf8"), format=ImportFormat.AUTO) + logger.info(f"Created configuration file: {config_path}") + if question("Open config file in the browser and continue installing?", default="yes") == "yes": + webbrowser.open(ws_file_url) + + +def build_wheel(tmp_dir: str, *, verbose: bool = False): + """Helper to build the wheel package""" + streams = {} + if not verbose: + streams = { + "stdout": subprocess.DEVNULL, + "stderr": subprocess.DEVNULL, + } + project_root = find_project_root(Path(__file__)) + if not project_root: + msg = "Cannot find project root" + raise NotADirectoryError(msg) + subprocess.run( + [sys.executable, "-m", "pip", "wheel", "--no-deps", "--wheel-dir", tmp_dir, project_root], **streams, check=True + ) + # get wheel name as first file in the temp directory + return next(Path(tmp_dir).glob("*.whl")) + + +def upload_artifacts(ws: WorkspaceClient, folder_base, local_wheel: Path): + """Helper to upload artifacts into a workspace folder""" + remote_wheel_file = f"{folder_base}/{local_wheel.name}" + remote_notebook_file = f"{folder_base}/install_ucx.py" + logger.info(f"Remote wheel file: {remote_wheel_file}") + 
logger.info(f"Remote notebook file: {remote_notebook_file}") + logger.info("Uploading...") + ws.workspace.mkdirs(folder_base) + with local_wheel.open("rb") as fh: + ws.workspace.upload(remote_wheel_file, fh, format=ImportFormat.AUTO, overwrite=True) + buf = BytesIO(INSTALL_NOTEBOOK.format(remote_wheel_file=remote_wheel_file).encode()) + ws.workspace.upload(remote_notebook_file, buf, overwrite=True) + + +def find_dir_with_leaf(folder: Path, leaf: str) -> Path | None: + # Walk Path.parents, which is finite; comparing against Path.root never terminates for relative or Windows drive paths + folder = folder.absolute() + for candidate in (folder, *folder.parents): + if (candidate / leaf).exists(): + return candidate + return None + + +def find_project_root(folder: Path) -> Path | None: + for leaf in ["pyproject.toml", "setup.py"]: + root = find_dir_with_leaf(folder, leaf) + if root is not None: + return root + return None + + +def question(text: str, *, default: str | None = None) -> str: + default_help = "" if default is None else f"\033[36m (default: {default})\033[0m" + prompt = f"\033[1m{text}{default_help}: \033[0m" + res = None + while not res: + res = input(prompt) + if not res and default is not None: + return default + return res diff --git a/src/databricks/labs/ucx/toolkits/group_migration.py b/src/databricks/labs/ucx/toolkits/group_migration.py index b29e1a78a3..d3343e9029 100644 --- a/src/databricks/labs/ucx/toolkits/group_migration.py +++ b/src/databricks/labs/ucx/toolkits/group_migration.py @@ -5,7 +5,9 @@ from databricks.labs.ucx.config import MigrationConfig from databricks.labs.ucx.inventory.inventorizer import Inventorizers from databricks.labs.ucx.inventory.permissions import PermissionManager -from databricks.labs.ucx.inventory.permissions_inventory import PermissionsInventoryTable +from databricks.labs.ucx.inventory.permissions_inventory import ( + PermissionsInventoryTable, +) from databricks.labs.ucx.managers.group import GroupManager diff --git a/tests/unit/test_install.py b/tests/unit/test_install.py new file mode 100644 index 0000000000..4b6b3bb3a0 --- 
/dev/null +++ b/tests/unit/test_install.py @@ -0,0 +1,56 @@ +import os.path + +from databricks.sdk.core import DatabricksError +from databricks.sdk.service import iam +from databricks.sdk.service.workspace import ImportFormat + +from databricks.labs.ucx import install + + +def test_build_wheel(tmp_path): + whl = install.build_wheel(str(tmp_path)) + assert os.path.exists(whl) + + +def test_save_config(mocker): + def not_found(_): + raise DatabricksError(error_code="RESOURCE_DOES_NOT_EXIST") + + mocker.patch("builtins.input", return_value="42") + ws = mocker.Mock() + ws.config.host = "https://foo" + ws.workspace.get_status = not_found + + install.save_config(ws, "abc") + + ws.workspace.upload.assert_called_with( + "abc/config.yml", + b"""groups: + backup_group_prefix: '42' + selected: + - '42' +inventory_database: '42' +log_level: '42' +num_threads: 42 +tacl: + auto: true +version: 1 +workspace_start_path: / +""", + format=ImportFormat.AUTO, + ) + + +def test_main_with_existing_conf_does_not_recreate_config(mocker): + mocker.patch("builtins.input", return_value="yes") + webbrowser_open = mocker.patch("webbrowser.open") + ws = mocker.patch("databricks.sdk.WorkspaceClient.__init__") + + ws.current_user.me = lambda: iam.User(user_name="me@example.com") + ws.config.host = "https://foo" + ws.workspace.get_status = lambda _: None + + install.main(ws) + + webbrowser_open.assert_called_with("https://foo/#workspace/Users/me@example.com/.ucx/config.yml") + ws.workspace.mkdirs.assert_called_with("/Users/me@example.com/.ucx")