diff --git a/src/databricks/labs/ucx/source_code/linters/pyspark.py b/src/databricks/labs/ucx/source_code/linters/pyspark.py index b56bdc544d..e9d5ded9ba 100644 --- a/src/databricks/labs/ucx/source_code/linters/pyspark.py +++ b/src/databricks/labs/ucx/source_code/linters/pyspark.py @@ -28,7 +28,12 @@ class Matcher(ABC): session_state: CurrentSessionState | None = None def matches(self, node: NodeNG): - return isinstance(node, Call) and isinstance(node.func, Attribute) and self._get_table_arg(node) is not None + return ( + isinstance(node, Call) + and self._get_table_arg(node) is not None + and isinstance(node.func, Attribute) + and Tree(node.func.expr).is_from_module("spark") + ) @abstractmethod def lint( diff --git a/src/databricks/labs/ucx/source_code/linters/python_ast.py b/src/databricks/labs/ucx/source_code/linters/python_ast.py index 704a17293f..c676ced966 100644 --- a/src/databricks/labs/ucx/source_code/linters/python_ast.py +++ b/src/databricks/labs/ucx/source_code/linters/python_ast.py @@ -8,6 +8,7 @@ from astroid import ( # type: ignore Assign, + AssignName, Attribute, Call, Const, @@ -223,6 +224,27 @@ def append_statements(self, tree: Tree) -> Tree: # the following may seem strange but it's actually ok to use the original module as tree root return tree + def is_from_module(self, module_name: str): + # if his is the call's root node, check it against the required module + if isinstance(self._node, Name): + if self._node.name == module_name: + return True + root = self.root + if not isinstance(root, Module): + return False + for value in root.globals.get(self._node.name, []): + if not isinstance(value, AssignName) or not isinstance(value.parent, Assign): + continue + if Tree(value.parent.value).is_from_module(module_name): + return True + return False + # walk up intermediate calls such as spark.range(...) + if isinstance(self._node, Call): + return isinstance(self._node.func, Attribute) and Tree(self._node.func.expr).is_from_module(module_name) + if isinstance(self._node, Attribute): + return Tree(self._node.expr).is_from_module(module_name) + return False + class TreeVisitor: @@ -282,6 +304,8 @@ def visit_importfrom(self, node: ImportFrom): def _matches(self, node: NodeNG, depth: int): if depth >= len(self._match_nodes): return False + if isinstance(node, Call): + return self._matches(node.func, depth) name, match_node = self._match_nodes[depth] if not isinstance(node, match_node): return False diff --git a/src/databricks/labs/ucx/source_code/linters/table_creation.py b/src/databricks/labs/ucx/source_code/linters/table_creation.py index 147c72e30c..ad45ec0006 100644 --- a/src/databricks/labs/ucx/source_code/linters/table_creation.py +++ b/src/databricks/labs/ucx/source_code/linters/table_creation.py @@ -50,7 +50,11 @@ def get_advice_span(self, node: NodeNG) -> Range | None: if call_args_count < self.min_args or call_args_count > self.max_args: return None - # Check 3: check presence of the format specifier: + # Check 3: ensure this is a spark call + if not Tree(node.func.expr).is_from_module("spark"): + return None + + # Check 4: check presence of the format specifier: # Option A: format specifier may be given as a direct parameter to the table-creating call # >>> df.saveToTable("c.db.table", format="csv") format_arg = Tree.get_arg(node, self.format_arg_index, self.format_arg_name) diff --git a/tests/unit/source_code/linters/test_files.py b/tests/unit/source_code/linters/test_files.py index 86a2596b18..cade4e976f 100644 --- a/tests/unit/source_code/linters/test_files.py +++ b/tests/unit/source_code/linters/test_files.py @@ -162,7 +162,9 @@ def test_folder_has_repr(): @pytest.mark.skip("Manual testing for troubleshooting") -@pytest.mark.parametrize("path", [(Path(site_packages, "mypy", "build.py"))]) +@pytest.mark.parametrize( + "path", [Path("/Users/eric.vergnaud/development/ucx/.venv/lib/python3.10/site-packages/spacy/pipe_analysis.py")] +) def test_known_issues(path: Path, migration_index): file_loader = FileLoader() folder_loader = FolderLoader(file_loader) @@ -181,4 +183,6 @@ def test_known_issues(path: Path, migration_index): resolver, lambda: LinterContext(migration_index, session_state), ) - linter.lint(MockPrompts({}), path) + advices = linter.lint(MockPrompts({}), path) + for advice in advices: + print(repr(advice)) diff --git a/tests/unit/source_code/linters/test_python_ast.py b/tests/unit/source_code/linters/test_python_ast.py index 47569c7321..16bdf6d34c 100644 --- a/tests/unit/source_code/linters/test_python_ast.py +++ b/tests/unit/source_code/linters/test_python_ast.py @@ -1,5 +1,5 @@ import pytest -from astroid import Assign, AstroidSyntaxError, Attribute, Call, Const, Expr # type: ignore +from astroid import Assign, AstroidSyntaxError, Attribute, Call, Const, Expr, Name # type: ignore from databricks.labs.ucx.source_code.linters.python_ast import Tree from databricks.labs.ucx.source_code.linters.python_infer import InferredValue @@ -147,3 +147,15 @@ def test_appends_statements(): values = list(InferredValue.infer_from_node(tree.node)) strings = list(value.as_string() for value in values) assert strings == ["Hello John!"] + + +def test_is_from_module(): + source = """ +df = spark.read.csv("hi") +df.write.format("delta").saveAsTable("old.things") +""" + tree = Tree.normalize_and_parse(source) + save_call = tree.locate( + Call, [("saveAsTable", Attribute), ("format", Attribute), ("write", Attribute), ("df", Name)] + )[0] + assert Tree(save_call).is_from_module("spark") diff --git a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-cache-table.py b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-cache-table.py index 82c82cb613..5cf6d4912b 100644 --- a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-cache-table.py +++ b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-cache-table.py @@ -19,7 +19,5 @@ spark.catalog.cacheTable(f"boop{stuff}") ## Some trivial references to the method or table in unrelated contexts that should not trigger warnigns. -# FIXME: This is a false positive; any method named 'cacheTable' is triggering the warning. -# ucx[table-migrate:+1:0:+1:39] Table old.things is migrated to brand.new.stuff in Unity Catalog something_else.cacheTable("old.things") a_function("old.things") diff --git a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-create-external-table.py b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-create-external-table.py index 04aef1841d..890982c8c4 100644 --- a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-create-external-table.py +++ b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-create-external-table.py @@ -35,7 +35,5 @@ do_stuff_with(df) ## Some trivial references to the method or table in unrelated contexts that should not trigger warnigns. - # FIXME: This is a false positive; any method named 'createExternalTable' is triggering the warning. - # ucx[table-migrate:+1:4:+1:52] Table old.things is migrated to brand.new.stuff in Unity Catalog something_else.createExternalTable("old.things") a_function("old.things") diff --git a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-create-table.py b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-create-table.py index 695c9de5a8..07bb7e5972 100644 --- a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-create-table.py +++ b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-create-table.py @@ -35,7 +35,5 @@ do_stuff_with(df) ## Some trivial references to the method or table in unrelated contexts that should not trigger warnigns. - # FIXME: This is a false positive; any method named 'createTable' is triggering the warning. - # ucx[table-migrate:+1:4:+1:44] Table old.things is migrated to brand.new.stuff in Unity Catalog something_else.createTable("old.things") a_function("old.things") diff --git a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-get-table.py b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-get-table.py index 8d42b21524..f4e7171651 100644 --- a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-get-table.py +++ b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-get-table.py @@ -24,7 +24,5 @@ do_stuff_with(table) ## Some trivial references to the method or table in unrelated contexts that should not trigger warnigns. - # FIXME: This is a false positive; any method named 'getTable' is triggering the warning. - # ucx[table-migrate:+1:4:+1:41] Table old.things is migrated to brand.new.stuff in Unity Catalog something_else.getTable("old.things") a_function("old.things") diff --git a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-is-cached.py b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-is-cached.py index b4e5f19d9b..83eb627b12 100644 --- a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-is-cached.py +++ b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-is-cached.py @@ -19,7 +19,5 @@ cached_previously = spark.catalog.isCached(f"boop{stuff}") ## Some trivial references to the method or table in unrelated contexts that should not trigger warnigns. - # FIXME: This is a false positive; any method named 'isCached' is triggering the warning. - # ucx[table-migrate:+1:4:+1:41] Table old.things is migrated to brand.new.stuff in Unity Catalog something_else.isCached("old.things") a_function("old.things") diff --git a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-list-columns.py b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-list-columns.py index e0f86e1e16..43d69c2f07 100644 --- a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-list-columns.py +++ b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-list-columns.py @@ -29,7 +29,5 @@ columns = spark.catalog.listColumns(f"boop{stuff}") ## Some trivial references to the method or table in unrelated contexts that should not trigger warnigns. - # FIXME: This is a false positive; any method named 'listColumns' is triggering the warning. - # ucx[table-migrate:+1:4:+1:44] Table old.things is migrated to brand.new.stuff in Unity Catalog something_else.listColumns("old.things") a_function("old.things") diff --git a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-recover-partitions.py b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-recover-partitions.py index 8312fc974b..08cd982e5e 100644 --- a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-recover-partitions.py +++ b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-recover-partitions.py @@ -19,7 +19,5 @@ spark.catalog.recoverPartitions(f"boop{stuff}") ## Some trivial references to the method or table in unrelated contexts that should not trigger warnigns. - # FIXME: This is a false positive; any method named 'recoverPartitions' is triggering the warning. - # ucx[table-migrate:+1:4:+1:50] Table old.things is migrated to brand.new.stuff in Unity Catalog something_else.recoverPartitions("old.things") a_function("old.things") diff --git a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-refresh-table.py b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-refresh-table.py index 7d1e043be9..8fabfedd01 100644 --- a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-refresh-table.py +++ b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-refresh-table.py @@ -19,7 +19,5 @@ spark.catalog.refreshTable(f"boop{stuff}") ## Some trivial references to the method or table in unrelated contexts that should not trigger warnigns. - # FIXME: This is a false positive; any method named 'refreshTable' is triggering the warning. - # ucx[table-migrate:+1:4:+1:45] Table old.things is migrated to brand.new.stuff in Unity Catalog something_else.refreshTable("old.things") a_function("old.things") diff --git a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-table-exists.py b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-table-exists.py index 625f070822..6aee348d91 100644 --- a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-table-exists.py +++ b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-table-exists.py @@ -34,7 +34,5 @@ pass ## Some trivial references to the method or table in unrelated contexts that should not trigger warnigns. - # FIXME: This is a false positive; any method named 'tableExists' is triggering the warning. - # ucx[table-migrate:+1:4:+1:44] Table old.things is migrated to brand.new.stuff in Unity Catalog something_else.tableExists("old.things") a_function("old.things") diff --git a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-uncache-table.py b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-uncache-table.py index 636bd8c942..63cf7a18b9 100644 --- a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-uncache-table.py +++ b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-uncache-table.py @@ -19,7 +19,5 @@ spark.catalog.uncacheTable(f"boop{stuff}") ## Some trivial references to the method or table in unrelated contexts that should not trigger warnigns. - # FIXME: This is a false positive; any method named 'uncacheTable' is triggering the warning. - # ucx[table-migrate:+1:4:+1:45] Table old.things is migrated to brand.new.stuff in Unity Catalog something_else.uncacheTable("old.things") a_function("old.things") diff --git a/tests/unit/source_code/samples/functional/pyspark/dataframe-write-insert-into.py b/tests/unit/source_code/samples/functional/pyspark/dataframe-write-insert-into.py index c27b91d673..2b8abe77b9 100644 --- a/tests/unit/source_code/samples/functional/pyspark/dataframe-write-insert-into.py +++ b/tests/unit/source_code/samples/functional/pyspark/dataframe-write-insert-into.py @@ -23,7 +23,5 @@ df.write.insertInto(f"boop{stuff}") ## Some trivial references to the method or table in unrelated contexts that should not trigger warnigns. - # FIXME: This is a false positive; any method named 'insertInto' is triggering the warning. - # ucx[table-migrate:+1:4:+1:43] Table old.things is migrated to brand.new.stuff in Unity Catalog something_else.insertInto("old.things") a_function("old.things") diff --git a/tests/unit/source_code/samples/functional/pyspark/dataframe-write-save-as-table.py b/tests/unit/source_code/samples/functional/pyspark/dataframe-write-save-as-table.py index 5b0017c5ef..0830629a96 100644 --- a/tests/unit/source_code/samples/functional/pyspark/dataframe-write-save-as-table.py +++ b/tests/unit/source_code/samples/functional/pyspark/dataframe-write-save-as-table.py @@ -23,8 +23,5 @@ df.write.format("delta").saveAsTable(f"boop{stuff}") ## Some trivial references to the method or table in unrelated contexts that should not trigger warnigns. - # FIXME: This are false positives; any method named 'saveAsTable' is triggering the warnings. - # ucx[table-migrate:+2:4:+2:44] Table old.things is migrated to brand.new.stuff in Unity Catalog - # ucx[table-migrate:+1:4:+1:44] The default format changed in Databricks Runtime 8.0, from Parquet to Delta something_else.saveAsTable("old.things") a_function("old.things") diff --git a/tests/unit/source_code/samples/functional/pyspark/spark-table.py b/tests/unit/source_code/samples/functional/pyspark/spark-table.py index 461569410e..d82cb728c2 100644 --- a/tests/unit/source_code/samples/functional/pyspark/spark-table.py +++ b/tests/unit/source_code/samples/functional/pyspark/spark-table.py @@ -31,9 +31,6 @@ df = spark.table(f"boop{stuff}") do_stuff_with(df) - ## Some trivial references to the method or table in unrelated contexts that should not trigger warnigns. - # FIXME: These are false positives; any method named 'table' is triggering the warnings. - # ucx[table-migrate:+2:4:+2:38] Table old.things is migrated to brand.new.stuff in Unity Catalog - # ucx[table-migrate:+1:4:+1:38] The default format changed in Databricks Runtime 8.0, from Parquet to Delta + ## Some trivial references to the method or table in unrelated contexts that should not trigger warnings. something_else.table("old.things") a_function("old.things")