forked from datahub-project/datahub
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(ingest): simplify more stateful ingestion state (datahub-project…
- Loading branch information
Showing 14 changed files with 111 additions and 169 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
45 changes: 0 additions & 45 deletions
45
metadata-ingestion/src/datahub/ingestion/source/state/iceberg_state.py
This file was deleted.
Oops, something went wrong.
69 changes: 10 additions & 59 deletions
69
metadata-ingestion/src/datahub/ingestion/source/state/kafka_state.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
from datahub.ingestion.source.state.entity_removal_state import (
    GenericCheckpointState,
    pydantic_state_migrator,
)


class KafkaCheckpointState(GenericCheckpointState):
    """
    This class represents the checkpoint state for Kafka based sources.

    Stores all the topics being ingested and is used to remove any stale
    entities. All of the add/diff bookkeeping previously implemented here
    now lives in GenericCheckpointState.
    """

    # Older checkpoints persisted topics as a compact encoded list under
    # `encoded_topic_urns`; this migrator folds that legacy field into the
    # generic "topic" entity bucket managed by GenericCheckpointState, so
    # previously-stored state keeps working after the refactor.
    _migration = pydantic_state_migrator(
        {
            "encoded_topic_urns": "topic",
        }
    )
35 changes: 2 additions & 33 deletions
35
metadata-ingestion/src/datahub/ingestion/source/state/ldap_state.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState


class LdapCheckpointState(GenericCheckpointState):
    """
    Base class for representing the checkpoint state for all LDAP based sources.

    Stores all corpuser and corpGroup entities being ingested and is used to
    remove any stale entities. The urn tracking and diffing logic previously
    implemented here now lives in GenericCheckpointState.
    """
9 changes: 0 additions & 9 deletions
9
metadata-ingestion/src/datahub/ingestion/source/state/tableau_state.py
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.