Optimisation: when detecting if a file is a notebook only read the start instead of the whole file (#2390)

## Changes
This PR updates the detection during linting of whether a file is a notebook or not for non-Workspace paths so that when performing the header-check we only read the start of the file (enough for the header) instead of loading it all. This is an optimisation rather than an issue of correctness.

### Tests
- existing unit tests
databrickslabs · Aug 12, 2024 · 2f3a4a9 · 2f3a4a9
1 parent 3a87cb9
commit 2f3a4a9
Showing 1 changed file with 9 additions and 7 deletions.
diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py
@@ -293,11 +293,13 @@ def is_a_notebook(path: Path, content: str | None = None) -> bool:
     language = file_language(path)
     if not language:
         return False
-    if content is None:
-        try:
-            content = path.read_text(guess_encoding(path))
-        except (FileNotFoundError, UnicodeDecodeError, PermissionError):
-            logger.warning(f"Could not read file {path}")
-            return False
     magic_header = f"{LANGUAGE_COMMENT_PREFIXES.get(language)} {NOTEBOOK_HEADER}"
-    return content.startswith(magic_header)
+    if content is not None:
+        return content.startswith(magic_header)
+    try:
+        with path.open('rt', encoding=guess_encoding(path)) as f:
+            file_header = f.read(len(magic_header))
+    except (FileNotFoundError, UnicodeDecodeError, PermissionError):
+        logger.warning(f"Could not read file {path}")
+        return False
+    return file_header == magic_header