diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py index 4fdf564162410..7df007e087979 100644 --- a/metadata-ingestion/src/datahub/configuration/common.py +++ b/metadata-ingestion/src/datahub/configuration/common.py @@ -258,7 +258,7 @@ def allow_all(cls) -> "AllowDenyPattern": return AllowDenyPattern() def allowed(self, string: str) -> bool: - if self._denied(string): + if self.denied(string): return False return any( @@ -266,7 +266,7 @@ def allowed(self, string: str) -> bool: for allow_pattern in self.allow ) - def _denied(self, string: str) -> bool: + def denied(self, string: str) -> bool: for deny_pattern in self.deny: if re.match(deny_pattern, string, self.regex_flags): return True @@ -290,7 +290,7 @@ def get_allowed_list(self) -> List[str]: raise ValueError( "allow list must be fully specified to get list of allowed strings" ) - return [a for a in self.allow if not self._denied(a)] + return [a for a in self.allow if not self.denied(a)] def __eq__(self, other): # type: ignore return isinstance(other, self.__class__) and self.__dict__ == other.__dict__ diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index 68c38d4d06461..6844b8a425a7b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -353,7 +353,7 @@ class TableauConfig( project_path_separator: str = Field( default="/", - description="The separator used for the project_pattern field between project names. By default, we use a slash. " + description="The separator used for the project_path_pattern field between project names. By default, we use a slash. " "You can change this if your Tableau projects contain slashes in their names, and you'd like to filter by project.", ) @@ -959,19 +959,36 @@ def _is_allowed_project(self, project: TableauProject) -> bool: return is_allowed def _is_denied_project(self, project: TableauProject) -> bool: - # Either project name or project path should exist in deny - for deny_pattern in self.config.project_pattern.deny: - # Either name or project path is denied - if re.match( - deny_pattern, project.name, self.config.project_pattern.regex_flags - ) or re.match( - deny_pattern, - self._get_project_path(project), - self.config.project_pattern.regex_flags, - ): - return True - logger.info(f"project({project.name}) is not denied as per project_pattern") - return False + """ + Why use an explicit denial check instead of the `AllowDenyPattern.allowed` method? + + Consider a scenario where a Tableau site contains four projects: A, B, C, and D, with the following hierarchical relationship: + + - **A** + - **B** (Child of A) + - **C** (Child of A) + - **D** + + In this setup: + + - `project_pattern` is configured with `allow: ["A"]` and `deny: ["B"]`. + - `extract_project_hierarchy` is set to `True`. + + The goal is to extract assets from project A and its children while explicitly denying the child project B. + + If we rely solely on the `project_pattern.allowed()` method, project C's assets will not be ingested. + This happens because project C is not explicitly included in the `allow` list, nor is it part of the `deny` list. + However, since `extract_project_hierarchy` is enabled, project C should ideally be included in the ingestion process unless explicitly denied. + + To address this, the function explicitly checks the deny regex to ensure that project C’s assets are ingested if it is not specifically denied in the deny list. This approach ensures that the hierarchy is respected while adhering to the configured allow/deny rules. + """ + + # Either project_pattern or project_path_pattern is set in a recipe + # TableauConfig.projects_backward_compatibility ensures that at least one of these properties is configured. + + return self.config.project_pattern.denied( + project.name + ) or self.config.project_path_pattern.denied(self._get_project_path(project)) def _init_tableau_project_registry(self, all_project_map: dict) -> None: list_of_skip_projects: List[TableauProject] = [] @@ -999,9 +1016,11 @@ def _init_tableau_project_registry(self, all_project_map: dict) -> None: for project in list_of_skip_projects: if ( project.parent_id in projects_to_ingest - and self._is_denied_project(project) is False + and not self._is_denied_project(project) ): - logger.debug(f"Project {project.name} is added in project registry") + logger.debug( + f"Project {project.name} is added in project registry as it's a child project and not explicitly denied in `deny` list" + ) projects_to_ingest[project.id] = project # We rely on automatic browse paths (v2) when creating containers. That's why we need to sort the projects here.